--- Attention mechanism
The attention mechanism allows a model to dynamically focus on different parts of the input sequence when generating each part of the output sequence, rather than treating all input tokens as equally important.
The first widely used attention mechanism was introduced in the 2014 paper “Neural Machine Translation by Jointly Learning to Align and Translate” by Bahdanau, Cho, and Bengio. It's an RNN-based attention model. (For key attention concepts and a basic example of how to compute attention scores, see the RNN-with-Attention notes)
--- Self-attention
Transformers use self-attention, whereas earlier models mainly used attention as cross-attention between the encoder and decoder.
Self-attention is still a form of attention. We'll show a detailed example in the "Self-Attention" section. It follows the standard mechanism: use Q and K to compute attention scores and weights, and then use V and the weights to compute context vectors. Self-attention allows the model to determine the importance of different parts of the input sequence when making predictions.
Takeaways:
- When was positional encoding first proposed?
- What is the usage of positional encoding, and what function is commonly used?
- What is the difference between cross-attention and self-attention?
- When was multi-head attention first proposed?
- In Multi-Head Attention, are the input Q, K, V for each head the same? What is the output of each head?
- Why must the output dimension after concatenation in Multi-Head Attention be equal to the input embedding dimension?
- Can parallelism be achieved in a transformer-based next-token prediction task? Is there any difference between training and inference?
- In the inference process of a next-token prediction task, why do we have a K, V cache but not a Q cache?
- What is the difference between nn.TransformerEncoderLayer and nn.TransformerEncoder? What are their inputs & outputs? If I want to implement a certain functionality, can I achieve it with either one?
- What is the difference between nn.TransformerDecoderLayer and nn.TransformerDecoder? What are their inputs & outputs? If I want to implement a certain functionality, can I achieve it with either one?
- What is the input and output of nn.Transformer?
--- Positional Encoding¶
Positional Encoding was first introduced in the Transformer model proposed in the 2017 paper “Attention is All You Need.” Positional encoding is used to inject information about the relative or absolute position of tokens in the sequence. They proposed two types
- fixed sinusoidal positional encoding
- learnable positional embeddings.
Positional Encoding is introduced because the Transformer architecture removes recurrence and convolution, so it lacks an inherent sense of word order. Transformers do not inherently understand the order of tokens in the input (they treat the input sequence as a set of vectors without any sequential structure), so we add positional encoding to supply that information.
Implementation Highlights¶
Sinusoidal Functions: The positional encoding uses sinusoidal functions of different frequencies to generate a unique positional vector for each position. This method ensures that each position has a distinct encoding and that similar positions have similar encodings, capturing the sequential nature of the data. Sinusoidal functions include:
- sin(x)
- cos(x)
- Or any of their variants (sin(ax), cos(bx), sin(kx+c), etc.)
The sine and cosine positional encoding formulas from "Attention Is All You Need": $$ PE_{(pos,2i)} = \sin\left(pos / 10000^{2i/d_{\text{model}}}\right) $$ $$ PE_{(pos,2i+1)} = \cos\left(pos / 10000^{2i/d_{\text{model}}}\right) $$ Meaning:
- pos: position (0, 1, 2, 3, ...)
- i: dimension index
- $d_{\text{model}}$: embedding dimension
Equations to Code:
position: A tensor representing the positions in the sequence.
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term: A tensor that scales the positions to generate the different frequencies of the sinusoidal functions. The denominator in the formula is: $$ 10000^{2i/d_{\text{model}}} = \exp\left(\log(10000) \cdot \frac{2i}{d_{\text{model}}}\right) $$ The code computes its reciprocal, $\exp\left(-\log(10000) \cdot \frac{2i}{d_{\text{model}}}\right)$, which is then multiplied by the positions. Code:
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
Sinusoidal Values:
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe[:, 0::2]: Assigns the sine values to the even indices of the encoding. pe[:, 1::2]: Assigns the cosine values to the odd indices of the encoding.
Usage: The positional encoding is added to the input embeddings, which combines the positional information with the token representations.
def forward(self, x):
    return x + self.pe[:x.size(0), :]
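Putting the pieces above together, here is a minimal sketch of a sinusoidal positional-encoding module. The class name and the (seq_len, batch, d_model) input layout are assumptions for illustration (they match the usage in the implementation section later):
import math
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        # Precompute the table once; assumes an even d_model
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)   # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)   # even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)   # odd dimensions
        pe = pe.unsqueeze(1)                           # (max_len, 1, d_model), broadcasts over the batch
        self.register_buffer("pe", pe)                 # stored with the module, but not trainable

    def forward(self, x):
        # x: (seq_len, batch, d_model); add the encodings of the first seq_len positions
        return x + self.pe[:x.size(0), :]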
--- Self-Attention¶
The paper "A Structured Self-Attentive Sentence Embedding" (Lin et al., 2017) had already used self-attention for sentence encoding, but at that time it was only an auxiliary module on top of an RNN. The Transformer was the first model to systematically and extensively use self-attention as its core mechanism.
What is the difference between cross-attention and self-attention? -- The way Q, K, and V are chosen.
Self-attention: Q, K, and V all come from the same sequence (such as within the encoder or decoder). Cross-attention: Q comes from the target sequence (decoder), while K and V come from the source sequence (encoder).
Self-Attention Mechanism¶
Self-attention means each element attends to itself and others in the same sequence. For a sequence,
- Query (Q): Represents the current word (or token) for which we are calculating the attention. (Each word will be used as Q once)
- Key (K): Represents all the words (or tokens) in the sequence.
- Value (V): Also represents all the words (or tokens) in the sequence.
Toy Math Example: Processing the Sentence "I love AI"
Input Embedding
Suppose we have a simple input sentence: "I love AI". For simplicity, let's use the following embeddings for these words:
- "I" -> $[0.1, 0.2]$
- "love" -> $[0.3, 0.4]$
- "AI" -> $[0.5, 0.6]$
So, the input sequence $X$ is:
X = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]
Positional Encoding
To provide positional information, we add simple positional encodings:
- Position 0 -> $[0.0, 0.1]$
- Position 1 -> $[0.1, 0.2]$
- Position 2 -> $[0.2, 0.3]$
Adding these to the embeddings, we get:
Positional Encoded X = [[0.1 + 0.0, 0.2 + 0.1], [0.3 + 0.1, 0.4 + 0.2], [0.5 + 0.2, 0.6 + 0.3]] = [[0.1, 0.3], [0.4, 0.6], [0.7, 0.9]]
Linear Transformation to Obtain Q, K, V
We perform linear transformations to get Query (Q), Key (K), and Value (V) matrices. For simplicity, assume the following weight matrices $W_Q$, $W_K$, and $W_V$:
W_Q = W_K = W_V = [[1, 0], [0, 1]]
Therefore:
Q = K = V = [[0.1, 0.3], [0.4, 0.6], [0.7, 0.9]]
Compute Attention Scores
Compute the attention scores using the dot product of Q and K^T:
Attention scores = Q * K^T = [[0.1, 0.3], [0.4, 0.6], [0.7, 0.9]] * [[0.1, 0.4, 0.7], [0.3, 0.6, 0.9]] = [[(0.1*0.1 + 0.3*0.3), (0.1*0.4 + 0.3*0.6), (0.1*0.7 + 0.3*0.9)], [(0.4*0.1 + 0.6*0.3), (0.4*0.4 + 0.6*0.6), (0.4*0.7 + 0.6*0.9)], [(0.7*0.1 + 0.9*0.3), (0.7*0.4 + 0.9*0.6), (0.7*0.7 + 0.9*0.9)]] = [[0.10, 0.22, 0.34], [0.22, 0.52, 0.82], [0.34, 0.82, 1.30]]
Apply Softmax to Get Attention Weights
Normalize the scores using the softmax function:
Attention weights = softmax(Attention scores). Rounding the softmax results to two decimals: ≈ [[0.29, 0.33, 0.37], [0.24, 0.32, 0.44], [0.19, 0.31, 0.50]]
Compute the Weighted Sum of Values
Use the attention weights to compute the weighted sum of the value vectors:
Self-Attention output = Attention weights * V = [[0.29, 0.33, 0.37], [0.24, 0.32, 0.44], [0.19, 0.31, 0.50]] * [[0.1, 0.3], [0.4, 0.6], [0.7, 0.9]] = [[(0.29*0.1 + 0.33*0.4 + 0.37*0.7), (0.29*0.3 + 0.33*0.6 + 0.37*0.9)], [(0.24*0.1 + 0.32*0.4 + 0.44*0.7), (0.24*0.3 + 0.32*0.6 + 0.44*0.9)], [(0.19*0.1 + 0.31*0.4 + 0.50*0.7), (0.19*0.3 + 0.31*0.6 + 0.50*0.9)]] ≈ [[0.42, 0.62], [0.46, 0.66], [0.49, 0.69]]
In the above example, the calculations for attention scores, attention weights, and the weighted sums for each word ("I", "love", "AI") are independent and can be executed simultaneously.
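As a quick sanity check of the arithmetic above, the same toy example can be reproduced in a few lines of PyTorch. This is a minimal sketch: W_Q = W_K = W_V is the identity, so Q = K = V = X, and the softmax is computed exactly rather than rounded:
import torch
import torch.nn.functional as F

# Positionally encoded inputs from the toy example; Q = K = V because W_Q = W_K = W_V = I
X = torch.tensor([[0.1, 0.3],
                  [0.4, 0.6],
                  [0.7, 0.9]])
Q = K = V = X

scores = Q @ K.T                      # (3, 3) attention scores
weights = F.softmax(scores, dim=-1)   # row-wise attention weights
output = weights @ V                  # (3, 2) context vectors, one per word

print(scores)   # ~[[0.10, 0.22, 0.34], [0.22, 0.52, 0.82], [0.34, 0.82, 1.30]]
print(weights)  # ~[[0.29, 0.33, 0.37], [0.24, 0.32, 0.44], [0.19, 0.31, 0.50]]
print(output)   # ~[[0.42, 0.62], [0.46, 0.66], [0.49, 0.69]]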
--- Multi-head Attention¶
In the 2017 paper "Attention is All You Need", the authors first proposed Multi-Head Attention, as an extension of Self-Attention and Cross-Attention that allows the model to learn different representations from multiple subspaces in parallel. It is one of the key innovations that led to the success of the Transformer. Previous attention mechanisms (such as Bahdanau Attention, Luong Attention, or Self-Attention) were all single-head, without the concept of "multi-head".
Multi-Head Attention is like having a team of note-takers, where each person focuses on a different aspect of the same lecture: one listens for key facts, another pays attention to tone and emotion, another tracks cause-and-effect. Afterward, they combine their notes to form a more complete understanding. This allows the model to focus on different parts of the input in multiple ways, enhancing its ability to capture various aspects of the input data.
Like self-attention, Multi-Head Attention is not exclusive to the Transformer architecture; it can also be used in RNN-based models.
Compute Multi-head Attention¶
Multi-head attention involves computing multiple sets of alignment scores in parallel, each with different learned linear transformations of the same Q, K, V. Each head computes attention score (usually scaled dot-product, other alignment functions are also used occasionally), and then computes a context vector as the output of each head.
$$\text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)$$
Where $W_i^Q$, $W_i^K$, and $W_i^V$ are learned projection matrices for the $i$-th head. Finally, we concatenate the outputs of all heads and (optionally) apply another learned linear transformation:
$$\text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \ldots, \text{head}_h) W_O$$
Where $W_O$ is a learned output projection matrix. In Multi-Head Attention, the final output $\text{MultiHead}(Q, K, V)$ must have the same dimension as the input embedding. This is because the MHA output is added to the input via a residual connection, and the dimensions must match: $$ \text{MHA}(x) + x $$ The final linear layer $W_O$ mainly serves this purpose. Of course, if $\text{Concat}(\text{head}_1, \ldots, \text{head}_h)$ already matches the input embedding size, $W_O$ may not be necessary.
Toy Math Example
- Embedding Dimension (embed_dim): 4
- Number of Heads (num_heads): 2
- Output Dimension per Head:
embed_dim / num_heads = 4 / 2 = 2
Assume we have 3 keys and values, and 1 query.
Head 1: $W_Q^{(1)} = \begin{bmatrix} 0.1 & 0.2 \\ 0.3 & 0.4 \\ 0.5 & 0.6 \\ 0.7 & 0.8 \end{bmatrix} \qquad W_K^{(1)} = \begin{bmatrix} 0.9 & 1.0 \\ 1.1 & 1.2 \\ 1.3 & 1.4 \\ 1.5 & 1.6 \end{bmatrix} \qquad W_V^{(1)} = \begin{bmatrix} 1.7 & 1.8 \\ 1.9 & 2.0 \\ 2.1 & 2.2 \\ 2.3 & 2.4 \end{bmatrix}$
Head 2: $W_Q^{(2)} = \begin{bmatrix} 2.5 & 2.6 \\ 2.7 & 2.8 \\ 2.9 & 3.0 \\ 3.1 & 3.2 \end{bmatrix} \qquad W_K^{(2)} = \begin{bmatrix} 3.3 & 3.4 \\ 3.5 & 3.6 \\ 3.7 & 3.8 \\ 3.9 & 4.0 \end{bmatrix} \qquad W_V^{(2)} = \begin{bmatrix} 4.1 & 4.2 \\ 4.3 & 4.4 \\ 4.5 & 4.6 \\ 4.7 & 4.8 \end{bmatrix}$
Let’s assume our input sequence for queries, keys, and values are:
Input Query: $\textbf{Q} = \begin{bmatrix} 0.5 & 0.6 & 0.7 & 0.8 \end{bmatrix}$
Input Key: $\textbf{K} = \begin{bmatrix} 0.1 & 0.2 & 0.3 & 0.4 \\ 0.5 & 0.6 & 0.7 & 0.8 \\ 0.9 & 1.0 & 1.1 & 1.2 \end{bmatrix}$
Input Value: $\textbf{V} = \begin{bmatrix} 1.3 & 1.4 & 1.5 & 1.6 \\ 1.7 & 1.8 & 1.9 & 2.0 \\ 2.1 & 2.2 & 2.3 & 2.4 \end{bmatrix}$
Head 1 Linear Transformations
Query:
$$\textbf{Q}_{\text{Head 1}} = \textbf{Q} \cdot W_Q^{(1)} = \begin{bmatrix} 0.5 & 0.6 & 0.7 & 0.8 \end{bmatrix} \cdot \begin{bmatrix} 0.1 & 0.2 \\ 0.3 & 0.4 \\ 0.5 & 0.6 \\ 0.7 & 0.8 \end{bmatrix} = \begin{bmatrix} 1.14 & 1.40 \end{bmatrix}$$
Key:
$$\textbf{K}_{\text{Head 1}} = \textbf{K} \cdot W_K^{(1)} = \begin{bmatrix} 0.1 & 0.2 & 0.3 & 0.4 \\ 0.5 & 0.6 & 0.7 & 0.8 \\ 0.9 & 1.0 & 1.1 & 1.2 \end{bmatrix} \cdot \begin{bmatrix} 0.9 & 1.0 \\ 1.1 & 1.2 \\ 1.3 & 1.4 \\ 1.5 & 1.6 \end{bmatrix} = \begin{bmatrix} 1.30 & 1.40 \\ 3.22 & 3.48 \\ 5.14 & 5.56 \end{bmatrix}$$
Value:
$$\textbf{V}_{\text{Head 1}} = \textbf{V} \cdot W_V^{(1)} = \begin{bmatrix} 1.3 & 1.4 & 1.5 & 1.6 \\ 1.7 & 1.8 & 1.9 & 2.0 \\ 2.1 & 2.2 & 2.3 & 2.4 \end{bmatrix} \cdot \begin{bmatrix} 1.7 & 1.8 \\ 1.9 & 2.0 \\ 2.1 & 2.2 \\ 2.3 & 2.4 \end{bmatrix} = \begin{bmatrix} 11.70 & 12.28 \\ 14.90 & 15.64 \\ 18.10 & 19.00 \end{bmatrix}$$
Head 2 Linear Transformations
Query:
$$\textbf{Q}_{\text{Head 2}} = \textbf{Q} \cdot W_Q^{(2)} = \begin{bmatrix} 0.5 & 0.6 & 0.7 & 0.8 \end{bmatrix} \cdot \begin{bmatrix} 2.5 & 2.6 \\ 2.7 & 2.8 \\ 2.9 & 3.0 \\ 3.1 & 3.2 \end{bmatrix} = \begin{bmatrix} 7.38 & 7.64 \end{bmatrix}$$
Key:
$$\textbf{K}_{\text{Head 2}} = \textbf{K} \cdot W_K^{(2)} = \begin{bmatrix} 0.1 & 0.2 & 0.3 & 0.4 \\ 0.5 & 0.6 & 0.7 & 0.8 \\ 0.9 & 1.0 & 1.1 & 1.2 \end{bmatrix} \cdot \begin{bmatrix} 3.3 & 3.4 \\ 3.5 & 3.6 \\ 3.7 & 3.8 \\ 3.9 & 4.0 \end{bmatrix} = \begin{bmatrix} 3.70 & 3.80 \\ 9.46 & 9.72 \\ 15.22 & 15.64 \end{bmatrix}$$
Value:
$$\textbf{V}_{\text{Head 2}} = \textbf{V} \cdot W_V^{(2)} = \begin{bmatrix} 1.3 & 1.4 & 1.5 & 1.6 \\ 1.7 & 1.8 & 1.9 & 2.0 \\ 2.1 & 2.2 & 2.3 & 2.4 \end{bmatrix} \cdot \begin{bmatrix} 4.1 & 4.2 \\ 4.3 & 4.4 \\ 4.5 & 4.6 \\ 4.7 & 4.8 \end{bmatrix} = \begin{bmatrix} 25.62 & 26.20 \\ 32.66 & 33.40 \\ 39.70 & 40.60 \end{bmatrix}$$
Compute Attention Scores
$$\text{Attention Scores}_{\text{Head 1}} = \textbf{Q}_{\text{Head 1}} \cdot \textbf{K}_{\text{Head 1}}^\top$$
$$\text{Attention Scores}_{\text{Head 1}} = \begin{bmatrix} 1.14 \cdot 1.30 + 1.40 \cdot 1.40 \\ 1.14 \cdot 3.22 + 1.40 \cdot 3.48 \\ 1.14 \cdot 5.14 + 1.40 \cdot 5.56 \end{bmatrix} = \begin{bmatrix} 1.482 + 1.960 \\ 3.6708 + 4.8720 \\ 5.8596 + 7.7840 \end{bmatrix} = \begin{bmatrix} 3.442 \\ 8.5428 \\ 13.6436 \end{bmatrix}$$
$$\text{Attention Scores}_{\text{Head 2}} = \textbf{Q}_{\text{Head 2}} \cdot \textbf{K}_{\text{Head 2}}^\top$$
$$\text{Attention Scores}_{\text{Head 2}} = \begin{bmatrix} 7.38 \cdot 3.70 + 7.64 \cdot 3.80 \\ 7.38 \cdot 9.46 + 7.64 \cdot 9.72 \\ 7.38 \cdot 15.22 + 7.64 \cdot 15.64 \end{bmatrix} = \begin{bmatrix} 27.3060 + 29.0320 \\ 69.8148 + 74.2608 \\ 112.3236 + 119.4896 \end{bmatrix} = \begin{bmatrix} 56.338 \\ 144.0756 \\ 231.8132 \end{bmatrix}$$
$$\text{Softmax}_{\text{Head 1}} = \left[\frac{e^{3.442}}{\text{Sum}},\ \frac{e^{8.5428}}{\text{Sum}},\ \frac{e^{13.6436}}{\text{Sum}}\right] \approx \begin{bmatrix} 0.000037 \\ 0.0061 \\ 0.9939 \end{bmatrix}$$
$$\text{Softmax}_{\text{Head 2}} \approx \begin{bmatrix} 6 \times 10^{-77} \\ 8 \times 10^{-39} \\ 1.0 \end{bmatrix}$$
Context Vectors
$$\text{Context}_{\text{Head 1}} = \text{Softmax}_{\text{Head 1}} \cdot \textbf{V}_{\text{Head 1}}$$
$$\text{Context}_{\text{Head 1}} = \begin{bmatrix} 0.000037 & 0.0061 & 0.9939 \end{bmatrix} \cdot \begin{bmatrix} 11.70 & 12.28 \\ 14.90 & 15.64 \\ 18.10 & 19.00 \end{bmatrix} \approx \begin{bmatrix} 18.08 & 18.98 \end{bmatrix}$$
$$\text{Context}_{\text{Head 2}} = \text{Softmax}_{\text{Head 2}} \cdot \textbf{V}_{\text{Head 2}}$$
$$\text{Context}_{\text{Head 2}} = \begin{bmatrix} 6 \times 10^{-77} & 8 \times 10^{-39} & 1.0 \end{bmatrix} \cdot \begin{bmatrix} 25.62 & 26.20 \\ 32.66 & 33.40 \\ 39.70 & 40.60 \end{bmatrix} \approx \begin{bmatrix} 39.70 & 40.60 \end{bmatrix}$$
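As a sanity check, the same two heads can be reproduced in a few lines of PyTorch and their context vectors concatenated. Since the concatenated size (2 + 2 = 4) already equals embed_dim, a final $W_O$ could simply be the identity here. The numbers are the ones from this toy example, and the dot products are left unscaled to match the hand computation above (a minimal sketch, not a general implementation):
import torch
import torch.nn.functional as F

Q = torch.tensor([[0.5, 0.6, 0.7, 0.8]])            # (1, 4) query
K = torch.tensor([[0.1, 0.2, 0.3, 0.4],
                  [0.5, 0.6, 0.7, 0.8],
                  [0.9, 1.0, 1.1, 1.2]])            # (3, 4) keys
V = torch.tensor([[1.3, 1.4, 1.5, 1.6],
                  [1.7, 1.8, 1.9, 2.0],
                  [2.1, 2.2, 2.3, 2.4]])            # (3, 4) values

# Per-head projection matrices (embed_dim=4 -> head_dim=2), values from the example
W_q1 = torch.tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]])
W_k1 = torch.tensor([[0.9, 1.0], [1.1, 1.2], [1.3, 1.4], [1.5, 1.6]])
W_v1 = torch.tensor([[1.7, 1.8], [1.9, 2.0], [2.1, 2.2], [2.3, 2.4]])
W_q2 = torch.tensor([[2.5, 2.6], [2.7, 2.8], [2.9, 3.0], [3.1, 3.2]])
W_k2 = torch.tensor([[3.3, 3.4], [3.5, 3.6], [3.7, 3.8], [3.9, 4.0]])
W_v2 = torch.tensor([[4.1, 4.2], [4.3, 4.4], [4.5, 4.6], [4.7, 4.8]])

def head(Wq, Wk, Wv):
    q, k, v = Q @ Wq, K @ Wk, V @ Wv
    weights = F.softmax(q @ k.T, dim=-1)   # unscaled dot-product, matching the hand computation
    return weights @ v                     # (1, 2) context vector for this head

out = torch.cat([head(W_q1, W_k1, W_v1), head(W_q2, W_k2, W_v2)], dim=-1)
print(out)  # ~[[18.08, 18.98, 39.70, 40.60]] -- concatenated heads, already of size embed_dim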
Example Code of Multi-head Attention¶
A simplified version of how the MultiheadAttention class is implemented in PyTorch (assuming batch-first inputs of shape (N, L, E)). As the formula above shows, the output of MultiheadAttention must have the same dimension as the input Q, K, V.
import torch
import torch.nn as nn
import torch.nn.functional as F
class MultiheadAttention(nn.Module):
def __init__(self, embed_dim, num_heads, dropout=0.0):
super(MultiheadAttention, self).__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
# Linear layers for query, key, and value
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.k_proj = nn.Linear(embed_dim, embed_dim)
self.v_proj = nn.Linear(embed_dim, embed_dim)
# Output projection
self.out_proj = nn.Linear(embed_dim, embed_dim)
def forward(self, query, key, value, key_padding_mask=None, need_weights=True, attn_mask=None):
# Project input tensors to query, key, and value
query = self.q_proj(query)
key = self.k_proj(key)
value = self.v_proj(value)
        # Reshape for multi-head attention (batch-first inputs: (N, L, E))
        batch_size, seq_len, embed_dim = query.size()
        src_len = key.size(1)  # key/value length may differ from the query length (cross-attention)
        query = query.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        key = key.view(batch_size, src_len, self.num_heads, self.head_dim).transpose(1, 2)
        value = value.view(batch_size, src_len, self.num_heads, self.head_dim).transpose(1, 2)
# Scaled dot-product attention
attn_output, attn_output_weights = self.scaled_dot_product_attention(query, key, value, attn_mask, key_padding_mask)
# Concatenate heads and project
attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)
attn_output = self.out_proj(attn_output)
if need_weights:
return attn_output, attn_output_weights
else:
return attn_output
def scaled_dot_product_attention(self, query, key, value, attn_mask=None, key_padding_mask=None):
# Calculate attention scores
attn_scores = torch.matmul(query, key.transpose(-2, -1)) / (self.head_dim ** 0.5)
if attn_mask is not None:
attn_scores = attn_scores + attn_mask
if key_padding_mask is not None:
attn_scores = attn_scores.masked_fill(key_padding_mask.unsqueeze(1).unsqueeze(2), float('-inf'))
# Normalize attention scores to probabilities
attn_weights = F.softmax(attn_scores, dim=-1)
if self.dropout > 0.0:
attn_weights = F.dropout(attn_weights, p=self.dropout)
# Weighted sum of values (Each item of the output corresponds to the weighted sum of V vectors w.r.t. the certain query)
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
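A quick usage sketch of the simplified class above (random, batch-first tensors; just to confirm that the output keeps the input shape required by the residual connection):
import torch

embed_dim, num_heads = 8, 2
mha = MultiheadAttention(embed_dim, num_heads)

x = torch.randn(4, 10, embed_dim)     # (batch=4, seq_len=10, embed_dim=8); self-attention: Q = K = V = x
out, weights = mha(x, x, x)

print(out.shape)      # torch.Size([4, 10, 8])    -- same shape as the input
print(weights.shape)  # torch.Size([4, 2, 10, 10]) -- per-head attention weights in this simplified version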
--- Transformers¶
Attention can be used as a module in a variety of models such as RNNs and MLPs. However, what makes the Transformer special is that it builds its entire architecture solely based on Attention, replacing RNNs/CNNs for handling sequential data.
In almost all cases, Transformers use Multi-Head Attention and Positional Encoding.
Easy Parallelism In Transformer
In RNNs, whether it's cross-attention or self-attention, computations cannot be parallelized because the hidden states (from which the Q, K and V in attention may be picked) are generated step by step. Attention can only be computed within the currently "generated" window; you can't apply attention to the entire input sequence at once.
Transformers, on the other hand, can compute in parallel because all their Q, K, and V vectors are obtained by applying linear transformations to the entire sequence at once, without relying on the output of the previous token. For the whole sequence, each token independently generates its own Q (query vector), and the attention for every query is computed in parallel. Note that in transformers, this parallelism specifically refers to parallel attention computation.
Parallelism in the Transformer is the key to making models bigger! The parallel attention computation is a significant advantage of transformers compared with the original RNN attention model, making transformers highly efficient for large-scale sequence processing tasks. Transformers are the foundation for many state-of-the-art models in natural language processing, such as BERT and GPT.
Next Token Prediction Task: Slow Inference
For Next Token Prediction, Transformers can be parallelized during training because the target sequence is known, and targets at every position can be predicted simultaneously, simply by controlling the flow of information with masks. When training an autoregressive model (such as GPT or a translation model decoder), you already have the complete target output sequence, for example:
Input: [A, B, C, D]
Target: [B, C, D, <EOS>]
This means for each token, you know the “ground truth” next word. So you can:
- Feed the entire target sequence into the model as decoder input, e.g., [A, B, C, D]
- The model uses masked self-attention to predict the next word at every position simultaneously
  - The first position predicts B, the second predicts C, and so on;
  - All Q/K/V projections, attention, linear layers, etc., can be computed in parallel!
- Add a mask to prevent information leakage
  - Although we feed in the full sequence, a causal mask ensures each position can only see what comes before it (it prevents peeking at the correct answer); see the sketch below.
  - This preserves the property of autoregressive modeling.
For Next Token Prediction, Transformers are highly parallel during training, but not during inference. The inference process is essentially like an RNN: generating tokens step by step, in order, where each step depends on the output of the previous step. Specifically, at step t, Q comes from the token at position t-1, K and V are from all previous tokens, so step t can only start after step t-1 has been predicted. This style of generation cannot process each generation position in parallel.
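To make the contrast concrete, here is a minimal sketch. The causal mask below is real PyTorch; the commented lines use a placeholder model (any decoder mapping token ids to next-token logits) purely to illustrate the two loops, and are not a specific library API:
import torch

T = 4
# Causal mask: -inf above the diagonal, so position i can only attend to positions <= i
causal_mask = torch.triu(torch.full((T, T), float("-inf")), diagonal=1)
print(causal_mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])
# (an nn.Transformer instance provides generate_square_subsequent_mask for the same purpose)

# Training: one forward pass scores every position at once; the mask prevents peeking ahead
#   logits = model(input_ids, attn_mask=causal_mask)          # (N, T, vocab_size)

# Inference: tokens are generated one at a time; step t needs the output of step t-1
#   ids = torch.tensor([[bos_id]])
#   for _ in range(max_new_tokens):
#       logits = model(ids)                                   # re-encodes the whole prefix
#       next_id = logits[:, -1].argmax(dim=-1, keepdim=True)  # greedy pick of the next token
#       ids = torch.cat([ids, next_id], dim=1)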
Also, transformer inference is slow because attention computation is quadratic complexity (O(n²)).
KV Cache
The KV cache improves inference speed in next-token generation by caching the K and V projections of previous tokens so that they are not recomputed at every step.
KV Cache means storing the K and V vectors generated by linear transformations of previous tokens, so that at each inference step, you don't have to recompute these for all previous tokens. In attention computation, for input token $x_t$, we compute:
$K_t = x_t \cdot W^K$
$V_t = x_t \cdot W^V$
If you don't use KV cache, at each generation step you'd have to repeat these linear transformations for all previous $x_1, x_2, ..., x_{t-1}$, which is very inefficient.
However, KV Cache does not reduce the computational complexity of Attention, which remains O(n²)!
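A minimal single-head sketch of the idea (all names here are illustrative, not a library API): the K and V projections of each new token are appended to a cache and reused, while only the new token's query is computed at each step.
import torch

d_model = 8
W_q = torch.randn(d_model, d_model)
W_k = torch.randn(d_model, d_model)
W_v = torch.randn(d_model, d_model)

k_cache, v_cache = [], []          # grows by one entry per generated token

def attend_step(x_t):
    """x_t: (batch, 1, d_model) embedding of the newest token only."""
    q_t = x_t @ W_q                # only the new token needs a query
    k_cache.append(x_t @ W_k)      # K, V of the new token are computed once and cached
    v_cache.append(x_t @ W_v)
    K = torch.cat(k_cache, dim=1)  # (batch, t, d_model) -- reused, not recomputed
    V = torch.cat(v_cache, dim=1)
    scores = q_t @ K.transpose(-2, -1) / d_model ** 0.5   # still O(t) work per step, O(n^2) overall
    weights = torch.softmax(scores, dim=-1)
    return weights @ V             # (batch, 1, d_model) context for the new token

x = torch.randn(2, 1, d_model)
for _ in range(5):                 # pretend we generate 5 tokens
    ctx = attend_step(x)
    x = torch.randn(2, 1, d_model) # in practice the next embedding comes from the model's output
print(ctx.shape)                   # torch.Size([2, 1, 8])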
--- nn.Transformer Related Classes¶
The key component for implementation is the nn.Transformer module. It is a complete Encoder–Decoder Transformer, internally consisting of:
- Encoder: uses TransformerEncoderLayer (self-attention)
- Decoder: uses TransformerDecoderLayer (masked self-attention + cross-attention)
Therefore, PyTorch’s nn.Transformer fully implements the standard Transformer architecture (as in “Attention Is All You Need”) with cross-attention.
We can see that in TransformerEncoderLayer and TransformerDecoderLayer, the structure is not just a multi-head attention; instead, it is a sequence of multi-head attention followed by a feedforward network (multi-head att -> FFN). In fact, in the original “Attention Is All You Need” paper, each encoder and decoder layer in the Transformer is structured as multi-head attention followed by FFN. We may ask: Attention itself is already very powerful, so why do we still need to add an FFN after each layer? Why is it not enough to just add another activation?
Attention is essentially a weighted average calculator: it can only reorder and aggregate information, but it is not a feature transformer. What multi-head attention essentially does is use attention weights to compute weighted sums of features, so the new features created are just linear combinations of the original features. From the perspective of linear algebra, linear combinations do not increase the dimensionality of the space, so attention does not create a new feature space. On the other hand, the linear layer in the FFN can truly change the feature space dimension, and with the activation function, the transformation done by the FFN becomes nonlinear and expressive enough to fit complex functions—completely transforming the feature space into a new one.
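A tiny numerical illustration of this point: whatever the attention weights are, each attention output is a weighted average of the rows of V, so stacking outputs on top of V adds no new directions (the rank does not grow); only the FFN's linear maps and nonlinearity move the representation into a genuinely new feature space. A minimal sketch:
import torch

V = torch.randn(5, 8, dtype=torch.float64)                          # 5 value vectors of dimension 8
W = torch.softmax(torch.randn(3, 5, dtype=torch.float64), dim=-1)   # 3 rows of attention weights (each sums to 1)
out = W @ V                                                         # attention outputs: weighted averages of V's rows

print(torch.linalg.matrix_rank(V))                    # tensor(5)
print(torch.linalg.matrix_rank(torch.cat([V, out])))  # still tensor(5): no new directions were created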
Details of nn.TransformerEncoderLayer and nn.TransformerEncoder¶
For nn.TransformerEncoderLayer and nn.TransformerEncoder, we only need to understand the structure of nn.TransformerEncoderLayer, because nn.TransformerEncoder is simply a stack of nn.TransformerEncoderLayers.
class TransformerEncoderLayer(nn.Module):
def __init__(self, d_model=512, nhead=8, dim_feedforward=2048, dropout=0.1):
super().__init__()
self.self_attn = nn.MultiheadAttention(
embed_dim=d_model,
num_heads=nhead,
batch_first=True
)
# FFN shape of ffn out == shape of ffn in
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.linear2 = nn.Linear(dim_feedforward, d_model)
# LayerNorm + Dropout
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
def forward(self, src, src_mask=None, src_key_padding_mask=None):
"""
src: (N, S, E)
src_mask: (S, S) or None
src_key_padding_mask: (N, S) or None
"""
attn_out, _ = self.self_attn( # shape of attn_out is the same as src: (N, S, E)
src, src, src, # let Q=K=V=src(self-attention)
attn_mask=src_mask,
key_padding_mask=src_key_padding_mask
)
# residual connection: src + attn_out -> norm
src = self.norm1(src + self.dropout1(attn_out))
# residual connection: src + ffn_out -> norm
ffn_out = self.linear2(F.relu(self.linear1(src)))
out = self.norm2(src + self.dropout2(ffn_out))
return out
A flow chart to visualize the nn.TransformerEncoderLayer architecture:
┌───────────────┐
src ───────▶│ Multi-Head SA │─────┐
└───────────────┘ │
▼
Add src & Norm (residual connection)
│
▼
┌──────────────────────┐
│ FFN │
└──────────────────────┘
│
▼
Add src & Norm (residual connection)
For TransformerEncoderLayer and TransformerEncoder, the output and input dimensions are exactly the same.
| Module | Input Shape | Output Shape |
|---|---|---|
| TransformerEncoderLayer | (N, S, E) | (N, S, E) |
| TransformerEncoder | (N, S, E) | (N, S, E) |
where
- S = sequence length
- N = batch size
- E = embedding dimension for each token (also the dimension after linear transformation to Q, K, V).
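A quick sketch with the real PyTorch modules to confirm that the input and output shapes match (batch_first=True so the shapes read (N, S, E)):
import torch
import torch.nn as nn

layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
encoder = nn.TransformerEncoder(layer, num_layers=6)

src = torch.randn(2, 10, 512)     # (N=2, S=10, E=512)
print(layer(src).shape)           # torch.Size([2, 10, 512])
print(encoder(src).shape)         # torch.Size([2, 10, 512])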
Details of nn.TransformerDecoderLayer and nn.TransformerDecoder¶
For nn.TransformerDecoderLayer and nn.TransformerDecoder, we only need to understand the structure of nn.TransformerDecoderLayer, because nn.TransformerDecoder is simply a stack of nn.TransformerDecoderLayers.
class TransformerDecoderLayer(nn.Module):
def __init__(self, d_model=512, nhead=8, dim_feedforward=2048, dropout=0.1):
super().__init__()
# 1. Masked Self-Attention (for target sequence)
self.self_attn = nn.MultiheadAttention(
embed_dim=d_model,
num_heads=nhead,
batch_first=True
)
# 2. Cross-Attention (attend to encoder output)
self.multihead_attn = nn.MultiheadAttention(
embed_dim=d_model,
num_heads=nhead,
batch_first=True
)
# 3. Feed-Forward Network (same as in encoder)
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.linear2 = nn.Linear(dim_feedforward, d_model)
# LayerNorm + Dropout
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.dropout3 = nn.Dropout(dropout)
def forward(
self,
tgt, # (N, T, E)
memory, # (N, S, E)
tgt_mask=None, # (T, T)
memory_mask=None, # (T, S)
tgt_key_padding_mask=None, # (N, T)
memory_key_padding_mask=None# (N, S)
):
"""
tgt: target sequence (N, T, E)
memory: encoder output (N, S, E)
tgt_mask: look-ahead mask for self-attention
memory_mask: optional mask for encoder-decoder attention
"""
# --- 1. Masked Self-Attention ---
tgt2, _ = self.self_attn(
tgt, tgt, tgt,
attn_mask=tgt_mask,
key_padding_mask=tgt_key_padding_mask
)
tgt = self.norm1(tgt + self.dropout1(tgt2))
# --- 2. Cross-Attention (Q from tgt, K/V from encoder memory) ---
tgt2, _ = self.multihead_attn(
tgt, memory, memory,
attn_mask=memory_mask,
key_padding_mask=memory_key_padding_mask
)
tgt = self.norm2(tgt + self.dropout2(tgt2))
# --- 3. Feed-Forward Network ---
ffn_out = self.linear2(F.relu(self.linear1(tgt)))
out = self.norm3(tgt + self.dropout3(ffn_out))
return out
A flow chart to visualize the nn.TransformerDecoderLayer architecture:
┌───────────────────┐
tgt ───────▶│ Masked Self-Attn │────┐
└───────────────────┘ │
▼
Add tgt & Norm (residual connection)
│
▼
┌───────────────────┐
memory ────▶│ Cross-Attention │────┐
└───────────────────┘ │
▼
Add tgt & Norm (residual connection)
│
▼
┌──────────────────────┐
│ FFN │
└──────────────────────┘
│
▼
Add tgt & Norm (residual connection)
For TransformerDecoderLayer and TransformerDecoder, the output and input dimensions are exactly the same.
| Module | Input Shape | Output Shape |
|---|---|---|
| TransformerDecoderLayer | (N, T, E) | (N, T, E) |
| TransformerDecoder | (N, T, E) | (N, T, E) |
where
- T = target sequence length
- S = source sequence length (from the encoder output)
- N = batch size
- E = embedding dimension for each token (also the dimension of Q, K, V after linear transformation).
The difference between Encoder and Decoder Layers is that the Decoder Layer contains two attention blocks:
- Masked Self-Attention, where the decoder attends to previous tokens only.
- Cross-Attention, where the decoder attends to the encoder’s output.
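A quick sketch with the real PyTorch modules to confirm the decoder shapes (batch_first=True; the output follows the target length T, not the source length S):
import torch
import torch.nn as nn

layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=True)
decoder = nn.TransformerDecoder(layer, num_layers=6)

tgt = torch.randn(2, 7, 512)        # (N=2, T=7, E=512) target sequence
memory = torch.randn(2, 10, 512)    # (N=2, S=10, E=512) encoder output
print(layer(tgt, memory).shape)     # torch.Size([2, 7, 512])
print(decoder(tgt, memory).shape)   # torch.Size([2, 7, 512])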
Details of nn.Transformer¶
For nn.Transformer, we only need to understand that it is composed of both an encoder and a decoder.
Internally, it combines a stack of nn.TransformerEncoderLayers and a stack of nn.TransformerDecoderLayers into a complete Transformer architecture for sequence-to-sequence modeling.
class Transformer(nn.Module):
def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
super().__init__()
# Encoder: stack of TransformerEncoderLayers
encoder_layer = nn.TransformerEncoderLayer(
d_model=d_model,
nhead=nhead,
dim_feedforward=dim_feedforward,
dropout=dropout,
batch_first=True
)
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
# Decoder: stack of TransformerDecoderLayers
decoder_layer = nn.TransformerDecoderLayer(
d_model=d_model,
nhead=nhead,
dim_feedforward=dim_feedforward,
dropout=dropout,
batch_first=True
)
self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
self.d_model = d_model
self.nhead = nhead
def forward(
self,
src, # (N, S, E)
tgt, # (N, T, E)
src_mask=None, # (S, S)
tgt_mask=None, # (T, T)
memory_mask=None, # (T, S)
src_key_padding_mask=None, # (N, S)
tgt_key_padding_mask=None, # (N, T)
memory_key_padding_mask=None# (N, S)
):
"""
src: source sequence (input)
tgt: target sequence (output)
"""
# Encoder: encodes the source sequence
memory = self.encoder(
src,
mask=src_mask,
src_key_padding_mask=src_key_padding_mask
) # (N, S, E)
# Decoder: generates the output sequence
out = self.decoder(
tgt,
memory,
tgt_mask=tgt_mask,
memory_mask=memory_mask,
tgt_key_padding_mask=tgt_key_padding_mask,
memory_key_padding_mask=memory_key_padding_mask
) # (N, T, E)
return out
A flow chart to visualize the overall nn.Transformer architecture:
┌───────────────────────┐
src ───────────▶│ Transformer Encoder │─────┐
└───────────────────────┘ │
▼
memory (encoder output)
│
▼
┌───────────────────────┐
tgt ─────────────────────────────▶│ Transformer Decoder │─────▶ output
└───────────────────────┘
For Transformer, the encoder and decoder parts have matching embedding dimensions, and both preserve input–output dimensionality.
| Module | Input Shape | Output Shape |
|---|---|---|
| TransformerEncoder | (N, S, E) | (N, S, E) |
| TransformerDecoder | (N, T, E) | (N, T, E) |
| Transformer | (src: N, S, E), (tgt: N, T, E) | (N, T, E) |
where
- S = source sequence length
- T = target sequence length
- N = batch size
- E = embedding dimension (shared between encoder and decoder)
Important parameters of nn.Transformer
| Parameter | Meaning | Role / Corresponding Transformer Component |
|---|---|---|
| d_model | The main dimensionality of the Transformer | The dimension of each token embedding, and also the size of Q, K, V vectors after linear projection. |
| nhead | Number of attention heads | Defines how many parallel attention heads are used in the Multi-Head Attention mechanism. |
| num_encoder_layers | Number of encoder layers | Determines how many TransformerEncoderLayers are stacked to form the encoder. |
| num_decoder_layers | Number of decoder layers | Determines how many TransformerDecoderLayers are stacked to form the decoder. |
| dim_feedforward | Hidden dimension in the feed-forward network (FFN) | Typically 4× larger than d_model; controls the capacity of the FFN sub-layer. |
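A quick sketch with the built-in nn.Transformer to confirm the shapes in the table above (batch_first=True; random tensors for illustration only):
import torch
import torch.nn as nn

model = nn.Transformer(d_model=512, nhead=8, num_encoder_layers=6,
                       num_decoder_layers=6, dim_feedforward=2048, batch_first=True)

src = torch.randn(2, 10, 512)   # (N, S, E)
tgt = torch.randn(2, 7, 512)    # (N, T, E)
out = model(src, tgt)
print(out.shape)                # torch.Size([2, 7, 512]) -- follows the target length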
--- Implementation: Using Transformer for Translation (Seq2Seq, Encoder and Decoder)¶
We use a seq2seq translation task here because the Transformer, as implemented in the original paper, is a standard Seq2Seq model: the input is a source-language sequence and the output is a target-language sequence, intended for machine translation tasks.
Generation starts with <bos> and predicts the next token iteratively until <eos>. The decoder input and output have the same shape; the output can be regarded as the input shifted by one step, so the next word is read from the last index of the output.
The decoder has two attention layers: the first is the self-attention of the target sequence (Q, K, V all come from tgt), and the second is the cross-attention, where Q comes from tgt and K, V come from the encoder output (memory).
As discussed before, parallelization is limited during inference. The model does not have access to the full target sequence; it generates one token at a time, so the prediction of the next token depends on the tokens generated so far.
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
# ----------
# Data Preparation
# ----------
# Define a simple dataset
data = [
("I am a student", "Je suis un étudiant"),
("He is a teacher", "Il est un enseignant"),
("She is a nurse", "Elle est une infirmière"),
("I love you", "Je t'aime"),
("How are you?", "Comment ça va?"),
]
# Build a vocabulary
def build_vocab(sentences):
vocab = {"<unk>": 0, "<pad>": 1, "<bos>": 2, "<eos>": 3}
for sentence in sentences:
for word in sentence.split():
if word not in vocab:
vocab[word] = len(vocab)
return vocab
src_sentences = [pair[0] for pair in data]
tgt_sentences = [pair[1] for pair in data]
src_vocab = build_vocab(src_sentences)
tgt_vocab = build_vocab(tgt_sentences)
print("SRC_VOCAB_SIZE:", len(src_vocab))
print("TGT_VOCAB_SIZE ", len(tgt_vocab))
print("src_vocab: ", src_vocab)
print("tgt_vocab: ", tgt_vocab)
# Convert sentences to tensors
def sentence_to_tensor(sentence, vocab):
return torch.tensor([vocab[word] for word in sentence.split()], dtype=torch.long)
class TranslationDataset(Dataset):
def __init__(self, data, src_vocab, tgt_vocab):
self.data = data
self.src_vocab = src_vocab
self.tgt_vocab = tgt_vocab
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
src, tgt = self.data[idx]
src_tensor = sentence_to_tensor(src, self.src_vocab)
tgt_tensor = sentence_to_tensor(tgt, self.tgt_vocab)
return src_tensor, tgt_tensor
# DataLoader
BATCH_SIZE = 2
PAD_IDX = src_vocab["<pad>"]
def collate_fn(batch):
src_batch, tgt_batch = [], []
for src_sample, tgt_sample in batch:
src_batch.append(torch.cat([torch.tensor([src_vocab["<bos>"]]), src_sample, torch.tensor([src_vocab["<eos>"]])]))
tgt_batch.append(torch.cat([torch.tensor([tgt_vocab["<bos>"]]), tgt_sample, torch.tensor([tgt_vocab["<eos>"]])]))
src_batch = nn.utils.rnn.pad_sequence(src_batch, padding_value=PAD_IDX)
tgt_batch = nn.utils.rnn.pad_sequence(tgt_batch, padding_value=PAD_IDX)
return src_batch, tgt_batch
train_loader = DataLoader(TranslationDataset(data, src_vocab, tgt_vocab), batch_size=BATCH_SIZE, collate_fn=collate_fn)
print("Raw data before padding: ")
for src, tgt in data:
src_tensor = sentence_to_tensor(src, src_vocab)
tgt_tensor = sentence_to_tensor(tgt, tgt_vocab)
print("src_tensor: ", src_tensor)
print("tgt_tensor: ", tgt_tensor)
SRC_VOCAB_SIZE: 18
TGT_VOCAB_SIZE 18
src_vocab: {'<unk>': 0, '<pad>': 1, '<bos>': 2, '<eos>': 3, 'I': 4, 'am': 5, 'a': 6, 'student': 7, 'He': 8, 'is': 9, 'teacher': 10, 'She': 11, 'nurse': 12, 'love': 13, 'you': 14, 'How': 15, 'are': 16, 'you?': 17}
tgt_vocab: {'<unk>': 0, '<pad>': 1, '<bos>': 2, '<eos>': 3, 'Je': 4, 'suis': 5, 'un': 6, 'étudiant': 7, 'Il': 8, 'est': 9, 'enseignant': 10, 'Elle': 11, 'une': 12, 'infirmière': 13, "t'aime": 14, 'Comment': 15, 'ça': 16, 'va?': 17}
Raw data before padding:
src_tensor: tensor([4, 5, 6, 7])
tgt_tensor: tensor([4, 5, 6, 7])
src_tensor: tensor([ 8, 9, 6, 10])
tgt_tensor: tensor([ 8, 9, 6, 10])
src_tensor: tensor([11, 9, 6, 12])
tgt_tensor: tensor([11, 9, 12, 13])
src_tensor: tensor([ 4, 13, 14])
tgt_tensor: tensor([ 4, 14])
src_tensor: tensor([15, 16, 17])
tgt_tensor: tensor([15, 16, 17])
# ----------
# Model Architecture
# ----------
class TransformerModel(nn.Module):
def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, nhead=8, num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=512, dropout=0.1):
super(TransformerModel, self).__init__()
self.d_model = d_model
self.src_embedding = nn.Embedding(src_vocab_size, d_model)
self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
self.positional_encoding = self._generate_positional_encoding(d_model)
self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
self.fc_out = nn.Linear(d_model, tgt_vocab_size)
def _generate_positional_encoding(self, d_model, max_len=5000):
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(1) # Add batch dimension
return pe
def forward(self, src, tgt):
src = self.src_embedding(src) * math.sqrt(self.d_model)
tgt = self.tgt_embedding(tgt) * math.sqrt(self.d_model)
# Apply positional encoding
src = src + self.positional_encoding[:src.size(0), :]
tgt = tgt + self.positional_encoding[:tgt.size(0), :]
        # Note: for strictly autoregressive training, a causal mask should also be passed here,
        # e.g. tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(0))
        output = self.transformer(src, tgt)
final_layer = self.fc_out(output)
return final_layer
# ----------
# Training
# ----------
# Training configuration
SRC_VOCAB_SIZE = len(src_vocab)
TGT_VOCAB_SIZE = len(tgt_vocab)
model = TransformerModel(SRC_VOCAB_SIZE, TGT_VOCAB_SIZE)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
# Training loop
NUM_EPOCHS = 10
for epoch in range(NUM_EPOCHS):
model.train()
total_loss = 0
for src_batch, tgt_batch in train_loader:
print("src, tgt: ", src_batch.shape, tgt_batch.shape) # src and tgt after padding
optimizer.zero_grad()
# Remove last token from tgt_batch for input
tgt_input = tgt_batch[:-1, :]
# Forward pass
output = model(src_batch, tgt_input)
# Remove the first token from tgt_batch for target
tgt_out = tgt_batch[1:, :].reshape(-1)
# Calculate loss
loss = criterion(output.reshape(-1, output.shape[-1]), tgt_out)
loss.backward()
optimizer.step()
total_loss += loss.item()
print(f'Epoch {epoch + 1}/{NUM_EPOCHS}, Loss: {total_loss / len(train_loader):.4f}')
/Users/yy544/.local/share/virtualenvs/Cookbook-0P5uvQVm/lib/python3.10/site-packages/torch/nn/modules/transformer.py:307: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)
warnings.warn(f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}")
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([5, 1]) torch.Size([5, 1])
Epoch 1/10, Loss: 3.0812
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([5, 1]) torch.Size([5, 1])
Epoch 2/10, Loss: 2.2407
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([5, 1]) torch.Size([5, 1])
Epoch 3/10, Loss: 1.8884
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([5, 1]) torch.Size([5, 1])
Epoch 4/10, Loss: 1.3930
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([5, 1]) torch.Size([5, 1])
Epoch 5/10, Loss: 1.2496
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([5, 1]) torch.Size([5, 1])
Epoch 6/10, Loss: 0.8894
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([5, 1]) torch.Size([5, 1])
Epoch 7/10, Loss: 0.5083
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([5, 1]) torch.Size([5, 1])
Epoch 8/10, Loss: 0.3647
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([5, 1]) torch.Size([5, 1])
Epoch 9/10, Loss: 0.2143
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([6, 2]) torch.Size([6, 2])
src, tgt: torch.Size([5, 1]) torch.Size([5, 1])
Epoch 10/10, Loss: 0.1677
# ----------
# Inference
# ----------
def greedy_decode(model, src, max_len, start_symbol):
    src = src.unsqueeze(1)  # (S,) -> (S, 1): add a batch dimension
    # Encode the source once; scale embeddings by sqrt(d_model) to match the training forward pass
    memory = model.transformer.encoder(model.src_embedding(src) * math.sqrt(model.d_model) + model.positional_encoding[:src.size(0), :])
ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long) # Start with <bos>, predict the next token iteratively until <eos>
for i in range(max_len-1):
tgt_mask = model.transformer.generate_square_subsequent_mask(ys.size(0)).type(torch.bool)
        out = model.transformer.decoder(model.tgt_embedding(ys) * math.sqrt(model.d_model) + model.positional_encoding[:ys.size(0), :], memory, tgt_mask=tgt_mask)
out = model.fc_out(out)
        _, next_word = torch.max(out[-1, :], dim=1) # The next word is read from the last index of the output; the output has the same shape as the decoder input, shifted by one step.
next_word = next_word.item()
# The entire sequence of generated tokens up to the newly generated next_word as Q for the transformer decoder
ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
if next_word == tgt_vocab['<eos>']:
break
return ys
# Translate a sentence
model.eval()
src_sentence = "I love you"
src_tensor = torch.tensor([src_vocab["<bos>"]] + [src_vocab[word] for word in src_sentence.split()] + [src_vocab["<eos>"]])
# Generate translation
translated_sentence = greedy_decode(model, src_tensor, max_len=10, start_symbol=tgt_vocab['<bos>'])
# Convert tokens back to words
translated_words = [list(tgt_vocab.keys())[list(tgt_vocab.values()).index(idx)] for idx in translated_sentence]
print(" ".join(translated_words))
<bos> Je t'aime <eos>