Key Points to Understand
Model Architecture
Scaled Dot-Product Attention
Multi-Head Attention
import math
import torch
import torch.nn as nn

class Transformer(nn.Module):
    def forward(self, enc_input, dec_input, mask):
        # Encode the source sequence, then decode the target while attending to the encoder output
        enc_output = self.encoder(enc_input)
        output = self.decoder(dec_input, enc_output, mask)
        return output
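As a rough usage sketch: the constructor arguments and the sequence-first (seq_len, batch) token layout below are assumptions inferred from the PositionalEncoding code later in this section, not part of the original code.

# Hypothetical usage; the Transformer(...) constructor arguments are assumed for illustration.
model = Transformer(vocab_size=10000, dim=512)
enc_input = torch.randint(0, 10000, (20, 8))               # source token IDs, (seq_len, batch)
dec_input = torch.randint(0, 10000, (15, 8))               # target token IDs, shifted right
mask = torch.triu(torch.ones(15, 15), diagonal=1).bool()   # causal mask (exact convention depends on the decoder)
output = model(enc_input, dec_input, mask)                 # decoder output for each target position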
class Encoder(nn.Module):
    def forward(self, x):
        x = self.embed(x)           # Input Embedding (nn.Embedding)
        x = x * (self.dim ** 0.5)   # scale embeddings by sqrt(D), as in the paper
        x = self.PE(x)              # Positional Encoding
        x = self.dropout(x)
        for i in range(6):          # stack of N = 6 encoder blocks
            x = self.EncoderBlocks[i](x)
        return x
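The constructor is not shown in the original; a minimal sketch consistent with the attribute names used in forward() might look like this (the vocab_size argument and the EncoderBlock(dim) signature are assumptions):

class Encoder(nn.Module):
    # Hypothetical __init__; only meant to match the attributes used above.
    def __init__(self, vocab_size, dim, dropout=0.1, n_blocks=6):
        super().__init__()
        self.dim = dim
        self.embed = nn.Embedding(vocab_size, dim)
        self.PE = PositionalEncoding(dim, dropout)
        self.dropout = nn.Dropout(p=dropout)
        self.EncoderBlocks = nn.ModuleList([EncoderBlock(dim) for _ in range(n_blocks)])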
Positional Encoding
$$ \text{PE}(\text{pos}) = \left[\sin (\text{pos}), \cos (\text{pos}), \sin \left(\frac{\text{pos}}{10000^{2/D}}\right), \cos \left(\frac{\text{pos}}{10000^{2/D}}\right), \ldots, \cos \left(\frac{\text{pos}}{10000^{(D-2)/D}}\right)\right] $$
$$ \text{PE} (\text{pos}, 2d) = \sin \left(\frac{\text{pos}}{10000^{2d/D}}\right) $$
$$ \text{PE} (\text{pos}, 2d + 1) = \cos \left(\frac{\text{pos}}{10000^{2d/D}}\right) $$
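The implementation below computes this frequency in log space: the div_term tensor holds

$$ \exp\left(-\frac{2d}{D}\ln 10000\right) = \frac{1}{10000^{2d/D}}, $$

so multiplying it by the position index reproduces the arguments of the sine and cosine terms above.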
class PositionalEncoding(nn.Module):
    # dim -> D
    def __init__(self, dim, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)    # (max_len, 1)
        div_term = torch.exp(torch.arange(0, dim, 2) * (-math.log(10000.0) / dim))
        pe = torch.zeros(max_len, 1, dim)
        pe[:, 0, 0::2] = torch.sin(position * div_term)  # even indices: sin
        pe[:, 0, 1::2] = torch.cos(position * div_term)  # odd indices: cos
        self.register_buffer('pe', pe)                   # buffer moves with the module's device

    def forward(self, x):
        # x: (seq_len, batch, dim); add the encoding for the first seq_len positions
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
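A quick shape check, assuming the sequence-first (seq_len, batch, dim) layout implied by pe's shape (the concrete sizes are illustrative only):

pe_layer = PositionalEncoding(dim=512)
x = torch.zeros(20, 8, 512)   # (seq_len=20, batch=8, dim=512)
out = pe_layer(x)
print(out.shape)              # torch.Size([20, 8, 512])
# Each position t receives the same sin/cos pattern across the batch dimension.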
class EncoderBlock(nn.Module):
    def forward(self, x):
        # Self-attention: query, key, and value all come from the same input
        Q = K = V = x
        _x = x                      # keep the input for the residual connection
        x = self.MHA(Q, K, V)       # Multi-Head Attention
        x = self.dropout_1(x)
        x = x + _x                  # residual connection
        x = self.layer_norm_1(x)    # Add & Norm
        _x = x
        x = self.FF(x)              # position-wise Feed Forward
        x = self.dropout_2(x)
        x = x + _x                  # residual connection
        x = self.layer_norm_2(x)
        return x
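The constructor is again omitted; a sketch consistent with the attributes used above could be the following. The MultiHeadAttention signature and the d_ff = 2048 inner width (the paper's default) are assumptions, not taken from the original.

class EncoderBlock(nn.Module):
    # Hypothetical __init__; MultiHeadAttention is covered in the next section.
    def __init__(self, dim, n_heads=8, d_ff=2048, dropout=0.1):
        super().__init__()
        self.MHA = MultiHeadAttention(dim, n_heads)
        self.FF = nn.Sequential(      # position-wise Feed Forward: Linear -> ReLU -> Linear
            nn.Linear(dim, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, dim),
        )
        self.dropout_1 = nn.Dropout(p=dropout)
        self.dropout_2 = nn.Dropout(p=dropout)
        self.layer_norm_1 = nn.LayerNorm(dim)
        self.layer_norm_2 = nn.LayerNorm(dim)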
Multi-Head Attention