
Commit 09b9927

refactoring code structure
1 parent 5ae2ceb commit 09b9927

8 files changed: 227 additions (+), 140 deletions (-)


config/params.json

Lines changed: 1 addition & 0 deletions
@@ -16,5 +16,6 @@
   "feed_forward_dim": 1024,
   "n_layer": 6,
   "n_head": 8,
+  "max_len": 20,
   "dropout": 0.1
 }
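The new max_len entry sits alongside the existing model hyperparameters. As a hedged illustration (not code from this repo), a flat JSON config like this can be exposed with attribute access, which is how the model code reads fields such as params.hidden_dim and params.n_layer:

# Hypothetical sketch: loading config/params.json into an attribute-style object.
# The repo's own loader is not shown in this commit; SimpleNamespace is an assumption.
import json
from types import SimpleNamespace

with open('config/params.json') as f:
    params = SimpleNamespace(**json.load(f))   # assumes a flat JSON object

print(params.max_len)   # -> 20, the value added in this commit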

model/attention.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import numpy as np
 
 
 class MultiHeadAttention(nn.Module):
@@ -59,7 +60,7 @@ def forward(self, query, key, value, mask=None):
         # self_attention = [batch size, sentence length, sentence length]
 
         if mask is not None:
-            self_attention = self_attention.masked_fill(mask, -1e10)
+            self_attention = self_attention.masked_fill(mask, -np.inf)
 
         # normalize self attention score by applying soft max function on each row
         attention_score = self.dropout(F.softmax(self_attention, dim=-1))
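Switching the fill value from -1e10 to -np.inf makes masked positions receive exactly zero attention weight after the softmax. A minimal standalone sketch with a toy score row and mask (not the repo's tensors):

# Toy demonstration of the masking behaviour changed above.
import torch
import torch.nn.functional as F

scores = torch.tensor([[1.0, 2.0, 3.0]])        # attention scores for one query
mask = torch.tensor([[False, False, True]])     # True marks positions to hide (e.g. <pad>)

weights = F.softmax(scores.masked_fill(mask, float('-inf')), dim=-1)
print(weights)   # masked position gets weight exactly 0.0; with -1e10 it was merely close to 0

One thing to keep in mind with -inf is that a row in which every position is masked comes out of the softmax as NaN; the padding and subsequent masks built in model/ops.py should always leave at least one unmasked position per row, so this case should not arise here.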

model/decoder.py

Lines changed: 13 additions & 13 deletions
@@ -3,6 +3,7 @@
 
 from model.attention import MultiHeadAttention
 from model.positionwise import PositionWiseFeedForward
+from model.ops import create_positional_encoding, create_non_pad_mask, create_subsequent_mask, create_target_mask
 
 
 class DecoderLayer(nn.Module):
@@ -39,36 +40,35 @@ class Decoder(nn.Module):
     def __init__(self, params):
         super(Decoder, self).__init__()
         self.device = params.device
+        self.hidden_dim = params.hidden_dim
 
-        self.token_embedding = nn.Embedding(params.output_dim, params.hidden_dim)
+        self.token_embedding = nn.Embedding(params.output_dim, params.hidden_dim, padding_idx=params.pad_idx)
         self.decoder_layers = nn.ModuleList([DecoderLayer(params) for _ in range(params.n_layer)])
         self.fc = nn.Linear(params.hidden_dim, params.output_dim)
 
         self.dropout = nn.Dropout(params.dropout)
         self.scale = torch.sqrt(torch.FloatTensor([params.hidden_dim])).to(self.device)
 
-    def forward(self, target, encoder_output, target_mask, dec_enc_mask, positional_encoding, target_non_pad):
+    def forward(self, target, source, encoder_output):
         # target = [batch size, target length]
+        # source = [batch size, source length]
         # encoder_output = [batch size, source length, hidden dim]
+        target_batch, target_len = target.size()
 
-        # target_mask = [batch size, target length, target length]
-        # dec_enc_mask = [batch size, target length, source length]
-        # positional_encoding = [batch size, target length, hidden dim]
+        subsequent_mask = create_subsequent_mask(target)
+        target_mask, dec_enc_mask = create_target_mask(source, target, subsequent_mask)
+        # target_mask = [batch size, target length, target length]
+        # dec_enc_mask = [batch size, target length, source length]
+        target_non_pad = create_non_pad_mask(target)  # [batch size, target length, 1]
 
-        # target_non_pad = [batch size, target length, 1]
-
-        # print(f'[D] Before embedding: {target.shape}')
         embedded = self.token_embedding(target)
-        # print(f'[D] Before embedding: {embedded.shape}')
-
+        positional_encoding = create_positional_encoding(target_batch, target_len, self.hidden_dim)
         target = self.dropout(embedded + positional_encoding)
 
         for decoder_layer in self.decoder_layers:
             target = decoder_layer(target, encoder_output, target_mask, dec_enc_mask, target_non_pad)
         # target = [batch size, target length, hidden dim]
-        # print(f'[D] After decoding: {target.shape}')
+
         output = self.fc(target)
         # output = [batch size, target length, output dim]
-        # print(f'[D] After predicting: {output.shape}')
-        # print('------------------------------------------------------------')
         return output
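The decoder now derives its own masks from source and target instead of receiving them from Transformer. The toy sketch below re-implements the target-mask combination performed by create_subsequent_mask and create_target_mask so it runs without the repo's vocabulary pickle; pad_idx=1 and the token ids are assumed values:

# Standalone illustration of the pad-mask | look-ahead-mask combination used in Decoder.forward.
import torch

pad_idx = 1
target = torch.tensor([[2, 5, 7, 1, 1]])          # <sos>, two tokens, then two <pad> tokens
target_len = target.size(1)

subsequent = torch.triu(torch.ones(target_len, target_len), diagonal=1).bool()   # hide future positions
pad_mask = (target == pad_idx).unsqueeze(1).repeat(1, target_len, 1)             # hide <pad> columns

target_mask = pad_mask | subsequent.unsqueeze(0)
print(target_mask[0].int())
# row i allows attention only to non-pad positions <= i, which is what the decoder layers receive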

model/encoder.py

Lines changed: 10 additions & 13 deletions
@@ -3,13 +3,13 @@
 
 from model.attention import MultiHeadAttention
 from model.positionwise import PositionWiseFeedForward
+from model.ops import create_positional_encoding, create_non_pad_mask, create_source_mask
 
 
 class EncoderLayer(nn.Module):
     def __init__(self, params):
         super(EncoderLayer, self).__init__()
         self.layer_norm = nn.LayerNorm(params.hidden_dim)
-
         self.self_attention = MultiHeadAttention(params)
         self.position_wise_ffn = PositionWiseFeedForward(params)
 
@@ -33,29 +33,26 @@ class Encoder(nn.Module):
     def __init__(self, params):
         super(Encoder, self).__init__()
         self.device = params.device
+        self.hidden_dim = params.hidden_dim
 
-        self.token_embedding = nn.Embedding(params.input_dim, params.hidden_dim)
+        self.token_embedding = nn.Embedding(params.input_dim, params.hidden_dim, padding_idx=params.pad_idx)
         self.encoder_layers = nn.ModuleList([EncoderLayer(params) for _ in range(params.n_layer)])
         self.dropout = nn.Dropout(params.dropout)
         self.scale = torch.sqrt(torch.FloatTensor([params.hidden_dim])).to(self.device)
 
-    def forward(self, source, source_mask, positional_encoding, source_non_pad):
-        # source = [batch size, source length]
-        # source_mask = [batch size, source length, source length]
-        # positional_encoding = [batch size, source length, hidden dim]
-        # source_non_pad = [batch size, source length, 1]
+    def forward(self, source):
+        # source = [batch size, source length]
+        source_batch, source_len = source.size()
 
-        # define positional encoding which encodes token's positional information
-        # print(f'[E] Before embedding: {source.shape}')
-        embedded = self.token_embedding(source)
-        # print(f'[E] After embedding: {embedded.shape}')
+        source_mask = create_source_mask(source)      # [batch size, source length, source length]
+        source_non_pad = create_non_pad_mask(source)  # [batch size, source length, 1]
 
+        embedded = self.token_embedding(source)
+        positional_encoding = create_positional_encoding(source_batch, source_len, self.hidden_dim)
         source = self.dropout(embedded + positional_encoding)
         # source = [batch size, source length, hidden dim]
 
         for encoder_layer in self.encoder_layers:
             source = encoder_layer(source, source_mask, source_non_pad)
         # source = [batch size, source length, hidden dim]
-        # print(f'[E] After encoding: {source.shape}')
-        # print('------------------------------------------------------------')
         return source
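Likewise, the encoder now builds its padding-related masks internally. A small standalone sketch of what create_non_pad_mask returns (pad_idx=1 is assumed here; the real index is read from pickles/eng.pickle in model/ops.py):

# Toy sketch of the non-pad mask now created inside Encoder.forward.
import torch

pad_idx = 1
source = torch.tensor([[2, 9, 4, 3, 1, 1]])       # one sentence with two trailing <pad> tokens

source_non_pad = source.ne(pad_idx).type(torch.float).unsqueeze(-1)
print(source_non_pad.shape)             # torch.Size([1, 6, 1]) -> [batch size, source length, 1]
print(source_non_pad[0].squeeze(-1))    # tensor([1., 1., 1., 1., 0., 0.])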

model/ops.py

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+import pickle
+import numpy as np
+import torch
+
+pickle_eng = open('pickles/eng.pickle', 'rb')
+eng = pickle.load(pickle_eng)
+pad_idx = eng.vocab.stoi['<pad>']
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+def create_subsequent_mask(target):
+    # target = [batch size, target length]
+    batch_size, target_length = target.size()
+    '''
+    if target length is 5 and diagonal is 1, this function returns
+        [[0, 1, 1, 1, 1],
+         [0, 0, 1, 1, 1],
+         [0, 0, 0, 1, 1],
+         [0, 0, 0, 0, 1],
+         [0, 0, 0, 0, 0]]
+    '''
+    # torch.triu returns the upper triangular part of a matrix based on user defined diagonal
+    subsequent_mask = torch.triu(torch.ones(target_length, target_length), diagonal=1).bool().to(device)
+    # subsequent_mask = [target length, target length]
+
+    # repeat subsequent mask 'batch size' times to cover all data instances in the batch
+    subsequent_mask = subsequent_mask.unsqueeze(0).repeat(batch_size, 1, 1)
+    # subsequent_mask = [batch size, target length, target length]
+
+    return subsequent_mask
+
+
+def create_source_mask(source):
+    '''
+    create masking tensor for encoder's self attention
+    if sentence is [2, 193, 9, 27, 10003, 1, 1, 1, 3] and 2 denotes <sos>, 3 denotes <eos> and 1 denotes <pad>
+    masking tensor will be [False, False, False, False, False, True, True, True, False]
+    :param source: [batch size, source length]
+    :return: source mask
+    '''
+    source_length = source.shape[1]
+
+    # create boolean tensor which will be used to mask padding tokens of the source sentence
+    source_mask = (source == pad_idx)
+    # source_mask = [batch size, source length]
+
+    # repeat sentence masking tensor 'sentence length' times
+    source_mask = source_mask.unsqueeze(1).repeat(1, source_length, 1)
+    # source_mask = [batch size, source length, source length]
+
+    return source_mask
+
+
+def create_target_mask(source, target, subsequent_mask):
+    '''
+    create masking tensor for decoder's self attention and decoder's attention on the output of encoder
+    if sentence is [2, 193, 9, 27, 10003, 1, 1, 1, 3] and 2 denotes <sos>, 3 denotes <eos> and 1 denotes <pad>
+    masking tensor will be [False, False, False, False, False, True, True, True, False]
+    :param source: [batch size, source length]
+    :param target: [batch size, target length]
+    :param subsequent_mask: [batch size, target length, target length]
+    :return: target mask, dec-enc mask
+    '''
+    target_length = target.shape[1]
+
+    # create boolean tensors which will be used to mask padding tokens of both source and target sentence
+    source_mask = (source == pad_idx)
+    target_mask = (target == pad_idx)
+    # target_mask = [batch size, target length]
+
+    # repeat sentence masking tensors 'sentence length' times
+    dec_enc_mask = source_mask.unsqueeze(1).repeat(1, target_length, 1)
+    target_mask = target_mask.unsqueeze(1).repeat(1, target_length, 1)
+
+    # dec_enc_mask = [batch size, target length, source length]
+    # target_mask = [batch size, target length, target length]
+
+    # combine <pad> token masking tensor and subsequent masking tensor for decoder's self attention
+    target_mask = target_mask | subsequent_mask
+    # target_mask = [batch size, target length, target length]
+
+    return target_mask, dec_enc_mask
+
+
+def create_non_pad_mask(sentence):
+    '''
+    create non-pad masking tensor which will be used to extract non-padded tokens from output
+    if sentence is [2, 193, 9, 27, 1, 1, 1, 3]
+    this function returns [[1], [1], [1], [1], [0], [0], [0], [1]]
+    '''
+    return sentence.ne(pad_idx).type(torch.float).unsqueeze(-1)
+
+
+def create_positional_encoding(batch_size, sentence_len, hidden_dim):
+    # PE(pos, 2i) = sin(pos / 10000 ** (2 * i / hidden_dim))
+    # PE(pos, 2i + 1) = cos(pos / 10000 ** (2 * i / hidden_dim))
+    sinusoid_table = np.array([pos / np.power(10000, 2 * i / hidden_dim)
+                               for pos in range(sentence_len) for i in range(hidden_dim)])
+    # sinusoid_table = [sentence length * hidden dim]
+
+    sinusoid_table = sinusoid_table.reshape(sentence_len, -1)
+    # sinusoid_table = [sentence length, hidden dim]
+
+    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # calculate pe for even dimensions
+    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # calculate pe for odd dimensions
+
+    # convert numpy based sinusoid table to torch.tensor and repeat it 'batch size' times
+    sinusoid_table = torch.FloatTensor(sinusoid_table).to(device)
+    sinusoid_table = sinusoid_table.unsqueeze(0).repeat(batch_size, 1, 1)
+    # sinusoid_table = [batch size, sentence length, hidden dim]
+
+    return sinusoid_table
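A quick standalone check of the sinusoid table produced by create_positional_encoding, re-implemented inline so it runs without the vocabulary pickle that model/ops.py loads at import time; the batch size, sentence length and hidden_dim are toy values:

# Standalone check of the positional-encoding table (same computation as above, toy sizes).
import numpy as np
import torch

batch_size, sentence_len, hidden_dim = 2, 4, 8

table = np.array([pos / np.power(10000, 2 * i / hidden_dim)
                  for pos in range(sentence_len) for i in range(hidden_dim)]).reshape(sentence_len, -1)
table[:, 0::2] = np.sin(table[:, 0::2])   # even columns: sine
table[:, 1::2] = np.cos(table[:, 1::2])   # odd columns: cosine

pe = torch.FloatTensor(table).unsqueeze(0).repeat(batch_size, 1, 1)
print(pe.shape)       # torch.Size([2, 4, 8]) -> [batch size, sentence length, hidden dim]
print(pe[0, 0, :4])   # position 0: sine entries are 0.0, cosine entries are 1.0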

model/transformer.py

Lines changed: 2 additions & 103 deletions
@@ -1,5 +1,3 @@
-import numpy as np
-import torch
 import torch.nn as nn
 
 from model.encoder import Encoder
@@ -9,113 +7,14 @@
 class Transformer(nn.Module):
     def __init__(self, params):
         super(Transformer, self).__init__()
-        self.params = params
-        self.hidden_dim = params.hidden_dim
-
-        self.device = params.device
         self.encoder = Encoder(params)
         self.decoder = Decoder(params)
 
-    def create_subsequent_mask(self, target):
-        # target = [batch size, target length]
-
-        batch_size, target_length = target.size()
-        '''
-        if target length is 5 and diagonal is 1, this function returns
-        [[0, 1, 1, 1, 1],
-         [0, 0, 1, 1, 1],
-         [0, 0, 0, 1, 1],
-         [0, 0, 0, 0, 1],
-         [0, 0, 0, 0, 1]]
-        '''
-        # torch.triu returns the upper triangular part of a matrix based on user defined diagonal
-        subsequent_mask = torch.triu(torch.ones(target_length, target_length), diagonal=1).bool().to(self.device)
-        # subsequent_mask = [target length, target length]
-
-        # repeat subsequent mask 'batch size' times to cover all data instances in the batch
-        subsequent_mask = subsequent_mask.unsqueeze(0).repeat(batch_size, 1, 1)
-        # subsequent_mask = [batch size, target length, target length]
-
-        return subsequent_mask
-
-    def create_mask(self, source, target, subsequent_mask):
-        # source = [batch size, source length]
-        # target = [batch size, target length]
-        # subsequent_mask = [batch size, target length, target length]
-        source_length = source.shape[1]
-        target_length = target.shape[1]
-
-        # create boolean tensors which will be used to mask padding tokens of both source and target sentence
-        source_mask = (source == self.params.pad_idx)
-        target_mask = (target == self.params.pad_idx)
-        # source_mask = [batch size, source length]
-        # target_mask = [batch size, target length]
-        '''
-        if sentence is [2, 193, 9, 27, 10003, 1, 1, 1, 3] and 2 denotes <sos>, 3 denotes <eos> and 1 denotes <pad>
-        masking tensor will be [False, False, False, False, False, True, True, True, False]
-        '''
-        # repeat sentence masking tensors 'sentence length' times
-        dec_enc_mask = source_mask.unsqueeze(1).repeat(1, target_length, 1)
-        source_mask = source_mask.unsqueeze(1).repeat(1, source_length, 1)
-        target_mask = target_mask.unsqueeze(1).repeat(1, target_length, 1)
-
-        # source_mask = [batch size, source length, source length]
-        # target_mask = [batch size, target length, target length]
-        # dec_enc_mask = [batch size, target length, source length]
-
-        # combine <pad> token masking tensor and subsequent masking tensor for decoder's self attention
-        target_mask = target_mask | subsequent_mask
-        # target_mask = [batch size, target length, target length]
-
-        return source_mask, target_mask, dec_enc_mask
-
-    def create_non_pad_mask(self, sentence):
-        # padding token shouldn't be used for the output tensor
-        # to use only non padding token, create non-pad masking tensor
-        return sentence.ne(self.params.pad_idx).type(torch.float).unsqueeze(-1)
-
-    def create_positional_encoding(self, batch_size, sentence_len):
-        # PE(pos, 2i) = sin(pos/10000 ** (2*i / hidden_dim)
-        # PE(pos, 2i + 1) = cos(pos/10000 ** (2*i / hidden_dim)
-        sinusoid_table = np.array([pos/np.power(10000, 2*i/self.hidden_dim)
-                                   for pos in range(sentence_len) for i in range(self.hidden_dim)])
-        # sinusoid_table = [sentence length * hidden dim]
-
-        sinusoid_table = sinusoid_table.reshape(sentence_len, -1)
-        # sinusoid_table = [sentence length, hidden dim]
-
-        sinusoid_table[0::2, :] = np.sin(sinusoid_table[0::2, :])  # calculate pe for even numbers
-        sinusoid_table[1::2, :] = np.sin(sinusoid_table[1::2, :])  # calculate pe for odd numbers
-
-        # convert numpy based sinusoid table to torch.tensor and repeat it 'batch size' times
-        sinusoid_table = torch.FloatTensor(sinusoid_table).to(self.device)
-        sinusoid_table = sinusoid_table.unsqueeze(0).repeat(batch_size, 1, 1)
-        # sinusoid_table = [batch size, sentence length, hidden dim]
-
-        return sinusoid_table
-
     def forward(self, source, target):
         # source = [batch size, source length]
         # target = [batch size, target length]
-        source_batch, source_len = source.size()
-        target_batch, target_len = target.size()
-
-        # create masking tensor for self attention (encoder & decoder) and decoder's attention on the output of encoder
-        subsequent_mask = self.create_subsequent_mask(target)
-        source_mask, target_mask, dec_enc_mask = self.create_mask(source, target, subsequent_mask)
-
-        # create non-pad masking tensor which will be used to extract non-padded tokens from output
-        source_non_pad = self.create_non_pad_mask(source)
-        target_non_pad = self.create_non_pad_mask(target)
-        # non_pad = [batch size, sentence length, 1]
-
-        source_positional_encoding = self.create_positional_encoding(source_batch, source_len)
-        target_positional_encoding = self.create_positional_encoding(target_batch, target_len)
-
-        source = self.encoder(source, source_mask, source_positional_encoding, source_non_pad)
-        output = self.decoder(target, source, target_mask, dec_enc_mask, target_positional_encoding, target_non_pad)
-        # output = [batch size, target length, output dim]
-
+        encoder_output = self.encoder(source)  # [batch size, source length, hidden dim]
+        output = self.decoder(target, source, encoder_output)  # [batch size, target length, output dim]
         return output
 
     def count_parameters(self):
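With the helpers moved into model/ops.py, Transformer.forward collapses to two calls. Below is a rough usage sketch, not taken from the repo: the params fields are inferred from the diffs above, pad_idx and the vocabulary sizes are placeholder values, and running it requires the rest of the repository plus pickles/eng.pickle, which model/ops.py opens at import time.

# Hedged end-to-end sketch of the simplified forward pass (assumed field names and values).
import torch
from types import SimpleNamespace

from model.transformer import Transformer   # needs the repo and pickles/eng.pickle on disk

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # matches model/ops.py

params = SimpleNamespace(
    input_dim=10000, output_dim=10000,        # source / target vocabulary sizes (assumed)
    hidden_dim=512, feed_forward_dim=1024,
    n_layer=6, n_head=8, max_len=20, dropout=0.1,
    pad_idx=1,                                # the real index comes from the vocab pickle
    device=device,
)

model = Transformer(params).to(device)
source = torch.randint(4, params.input_dim, (2, 15)).to(device)    # [batch size, source length]
target = torch.randint(4, params.output_dim, (2, 12)).to(device)   # [batch size, target length]

output = model(source, target)
print(output.shape)   # expected: [batch size, target length, output dim], e.g. torch.Size([2, 12, 10000])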
