RNN with Attention

The attention mechanism allows the model to dynamically focus on different parts of the input sequence, rather than treating all input tokens as equally important.

Attention first appeared in 2014, when Bahdanau et al. added it to an RNN-based seq2seq model. It was introduced to address the fixed-length context vector bottleneck, and it was the first application of attention within the RNN framework. It was later fully developed and popularized by the Transformer model.

The attention mechanism itself should be understood independently from RNN-based attention or Transformer-based attention. Fundamentally, attention just requires inputs for Q (query), K (key), and V (value), plus an alignment function; that's all you need to compute attention. Whether it's attention in RNNs or Transformers simply depends on where the Q, K, and V come from (see the Attention Mechanism Note for a brief intro to attention).

In general, we have two types of RNN models in terms of the output type.

  • Vanilla RNN: For tasks such as text classification, sentiment analysis, or time-series prediction. The input is a sequence, but the output is just a single label or a single value.

    • e.g., Input: "The movie is great" → Output: "Positive"
  • Seq2Seq RNN: The input is a sequence and the output is also a sequence, which may have the same or a different length as the input. The output sequence is generated step by step, where each output token depends on the previously generated tokens.

    • e.g., Input: English "The cat sits" → Output: Chinese "猫坐下了"

Attention was first introduced in the Seq2Seq RNN setup, but as we mentioned earlier, it should be understood as an independent mechanism. As long as the input is a sequence, attention can be applied.

  • Vanilla RNN (seq→label): By default, it only uses the final hidden state, but attention can be added to let the model automatically focus on the most important time steps (see the sketch after this list).
  • Seq2Seq RNN (seq→seq): Attention is essentially a standard component, since the decoder needs to decide where to attend at each output step.
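For the seq→label case, a minimal sketch of attention pooling over an RNN's hidden states is shown below (the layer names and sizes here are my own, not from a specific paper; the learned scoring layer plays the role of a fixed query):

import torch
import torch.nn as nn
import torch.nn.functional as F

class RNNWithAttentionPooling(nn.Module):
    """Toy seq -> label model: classify from an attention-weighted sum of RNN states."""
    def __init__(self, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.score = nn.Linear(hidden_dim, 1)        # alignment score for each time step
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x_emb):                        # x_emb: (batch, seq_len, embed_dim)
        out, _ = self.rnn(x_emb)                     # out: (batch, seq_len, hidden_dim)
        weights = F.softmax(self.score(out), dim=1)  # attention weights: (batch, seq_len, 1)
        pooled = (weights * out).sum(dim=1)          # weighted sum instead of the last hidden state
        return self.classifier(pooled)               # logits: (batch, num_classes)

Compared with using only the final hidden state, the model learns per-step weights and classifies from the weighted sum of all hidden states.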

This note will focus on the Seq2Seq RNN model with attention. Two takeaways:

  1. How to pick Q, K, and V in the Seq2Seq RNN setup, and how to integrate the context vector into the Seq2Seq RNN model?
  2. How to implement and train a Seq2Seq RNN attention model?

--- Key Concepts

  1. Attention Mechanism:

    • Alignment Function: A function that calculates an alignment score between the query and each key.

    • Alignment Score (also Attention Score): The output of the alignment function. It has two interpretations: how well each K matches the Q, and how much attention each V should receive when generating the context vector.

    • Attention Weight: The normalized alignment score (typically normalized with a softmax function). It determines how much attention each V should receive when generating the context vector.

    • Context Vector: The weighted sum of the V using the attention weights. It represents the relevant information from the V needed to produce the current output. (A small numeric sketch of this score → weight → context pipeline follows the list below.)

  2. Scope of Attention:

    • Global Attention: Considers all input tokens when computing the attention weights.

    • Local Attention: Focuses on a specific subset of input tokens, typically using a sliding window or another heuristic to limit the scope.

  3. Source of Attention:

    • Self-Attention:
      Q, K, V all come from the same sequence. Each token attends to all others in the same input. Used in Transformer encoders.

    • Cross-Attention:
      Q comes from one sequence (e.g., decoder), K and V come from another (e.g., encoder). Enables interactions between sequences.

    • Multi-Modal Attention:
      A special case of cross-attention where Q and K/V come from different modalities (e.g., text vs image).
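To make these terms concrete, here is a minimal numeric sketch of the score → weight → context pipeline (random toy tensors, dot-product alignment for simplicity; the rest of this note uses additive alignment). The comments mark the only place where self- and cross-attention differ: the source of Q, K, and V.

import torch
import torch.nn.functional as F

def attend(Q, K, V):
    scores = Q @ K.T                     # alignment scores: (num_queries, num_keys)
    weights = F.softmax(scores, dim=-1)  # attention weights: each row sums to 1
    context = weights @ V                # context vectors: (num_queries, value_dim)
    return context, weights

enc_states = torch.randn(5, 4)           # e.g., encoder hidden states for 5 source tokens
dec_states = torch.randn(3, 4)           # e.g., decoder hidden states for 3 target steps

self_ctx, _ = attend(enc_states, enc_states, enc_states)   # self-attention: Q, K, V from the same sequence
cross_ctx, _ = attend(dec_states, enc_states, enc_states)  # cross-attention: Q from decoder, K/V from encoder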

--- Seq2Seq Attention Model with Additive Alignment

  • Additive (Bahdanau) Attention

    The alignment score is computed by projecting the query vector $Q$ and the key vector $K$ with separate linear layers, adding the results, and passing the sum through a tanh activation function. $$\text{Score}(Q, K) = v^\top \tanh( W_Q Q + W_K K )$$ Where $W_Q$ and $W_K$ are weight matrices, and $v$ is a vector that projects the combined vector down to a scalar score (notice that $v$ is a parameter that will be updated in backpropagation; it is completely different from $V$). The softmax function is then applied to obtain the attention weights. $$\text{Attention}(Q, K, V) = \text{softmax}(\text{Score}) V$$
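A minimal functional sketch of these two equations is given below (tensor shapes are arbitrary; the nn.Module version used for training appears in the implementation section):

import torch

def additive_score(q, K, W_Q, W_K, v):
    # q: (hidden,); K: (seq_len, hidden); W_Q, W_K: (attn, hidden); v: (attn,)
    return torch.tanh(K @ W_K.T + W_Q @ q) @ v                         # one score per key: (seq_len,)

def additive_attention(q, K, V, W_Q, W_K, v):
    weights = torch.softmax(additive_score(q, K, W_Q, W_K, v), dim=0)  # attention weights
    return weights @ V, weights                                        # context vector, weights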

In this toy example, we use a simple sequence-to-sequence (seq2seq) model with an RNN-based encoder-decoder architecture enhanced by additive attention (Bahdanau attention). The input consists of a short sequence of 3 words: ["the", "cat", "sits"].

  • The encoder is a unidirectional RNN that processes the input sequence word by word and outputs a hidden state at each step. In our setup, each word embedding is a 3-dimensional vector, and the encoder maps it into a 2-dimensional hidden state.
  • The decoder is another RNN that generates the output sequence one step at a time. At each time step, it produces a 2-dimensional hidden state, which acts as the query for the attention mechanism. The decoder then combines this query with the attention-derived context vector (also 2D) to make a prediction.
  • The attention mechanism sits between the encoder and decoder (cross-attention). It uses additive attention to compute a relevance score between the decoder’s current hidden state (query) and each encoder hidden state (key/value), producing a context vector as a weighted sum of encoder states.

Step 1: Encoder Hidden States

We assume the encoder has already processed the input words and produced hidden states (e.g., via an RNN). For simplicity, suppose each hidden state is a 2-dimensional vector. These are our encoder outputs.

  • $h_1$ for "the": $[1, 0]$
  • $h_2$ for "cat": $[0, 1]$
  • $h_3$ for "sits": $[1, 1]$

Step 2: Decoder Hidden State (Query)

Now, suppose the decoder is at time step $t$, and its current hidden state is as follows. This will be our query (Q).

  • $q_t = [1, 1]$

Step 3: Compute Alignment Scores (Additive Attention)

In Additive Attention, the alignment score is computed as: $$ e_i = v^\top \tanh(W_h h_i + W_q q_t) $$ We'll choose simple toy matrices to make the calculation easy. In reality, these are parameters learned during optimization.

  • $W_h = \begin{bmatrix} 1 & 0 \\ 0 & 1 \end{bmatrix}$ (identity)
  • $W_q = \begin{bmatrix} 1 & 1 \\ 1 & 1 \end{bmatrix}$
  • $v = \begin{bmatrix} 1 \\ 1 \end{bmatrix}$

Alignment score for "the" (i = 1):

  • $h_1 = [1, 0]$
  • $W_h h_1 = [1, 0]$
  • $W_q q_t = [2, 2]$
  • Sum: $[1, 0] + [2, 2] = [3, 2]$
  • $\tanh([3, 2]) \approx [0.995, 0.964]$
  • Score: $v^\top \cdot [0.995, 0.964] = 0.995 + 0.964 = 1.959$

Alignment score for "cat" (i = 2):

  • $h_2 = [0, 1]$
  • $W_h h_2 = [0, 1]$
  • Sum: $[0, 1] + [2, 2] = [2, 3]$
  • $\tanh([2, 3]) \approx [0.964, 0.995]$
  • Score: $v^\top \cdot [0.964, 0.995] = 0.964 + 0.995 = 1.959$

Alignment score for "sits" (i = 3):

  • $h_3 = [1, 1]$
  • $W_h h_3 = [1, 1]$
  • Sum: $[1, 1] + [2, 2] = [3, 3]$
  • $\tanh([3, 3]) \approx [0.995, 0.995]$
  • Score: $v^\top \cdot [0.995, 0.995] = 0.995 + 0.995 = 1.99$

Step 4: Apply Softmax to Alignment Scores

We now apply softmax to the alignment scores $e = [1.959, 1.959, 1.99]$: $$ \text{Softmax}(1.959, 1.959, 1.99) = \left( \frac{e^{1.959}}{Z}, \frac{e^{1.959}}{Z}, \frac{e^{1.99}}{Z} \right) $$ where $e^{1.959} \approx 7.09$, $e^{1.99} \approx 7.32$, $Z = 7.09 + 7.09 + 7.32 = 21.5$. So

  • $\alpha_1 = \alpha_2 \approx \frac{7.09}{21.5} \approx 0.33$
  • $\alpha_3 \approx \frac{7.32}{21.5} \approx 0.34$

Step 5: Compute Context Vector

Now, compute the context vector as the weighted sum of the encoder hidden states (V): $$ c_t = 0.33 \cdot [1, 0] + 0.33 \cdot [0, 1] + 0.34 \cdot [1, 1] $$ $$ = [0.33, 0] + [0, 0.33] + [0.34, 0.34] = [0.67, 0.67] $$
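The hand computations in Steps 3 to 5 can be verified in a few lines of PyTorch, using the same toy $W_h$, $W_q$, $v$, encoder states, and query:

import torch

H = torch.tensor([[1., 0.], [0., 1.], [1., 1.]])  # encoder states h_1..h_3 (keys and values)
q = torch.tensor([1., 1.])                        # decoder state q_t (query)
W_h, W_q, v = torch.eye(2), torch.ones(2, 2), torch.ones(2)

scores = torch.tanh(H @ W_h.T + q @ W_q.T) @ v    # e ≈ [1.959, 1.959, 1.990]
weights = torch.softmax(scores, dim=0)            # alpha ≈ [0.33, 0.33, 0.34]
context = weights @ H                             # c_t ≈ [0.67, 0.67]
print(scores, weights, context)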

Step 6: Where Does the Context Vector Go?

Once we compute the context vector from attention, we don't just stop there. The context vector is combined with the decoder's current hidden state to produce the output at this step (in some variants it is also fed into the decoder's next step). Specifically: $$ \text{Decoder Output} = f(s_t, c_t) $$ where:

  • $s_t$ is the current decoder hidden state
  • $c_t$ is the context vector from attention
  • $f(\cdot)$ might be a feedforward layer followed by a softmax to predict the next word
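A minimal sketch of this step, using the toy dimensions from this example (the linear layer is randomly initialized here, so the numbers are only illustrative; the Decoder class in the implementation section does exactly this with torch.cat followed by nn.Linear):

import torch
import torch.nn as nn

hidden_dim, vocab_size = 2, 9
s_t = torch.tensor([[1.0, 1.0]])           # current decoder hidden state: (1, hidden_dim)
c_t = torch.tensor([[0.67, 0.67]])         # context vector from attention: (1, hidden_dim)

f = nn.Linear(hidden_dim * 2, vocab_size)  # one common choice of f(s_t, c_t)
logits = f(torch.cat([s_t, c_t], dim=-1))  # (1, vocab_size)
probs = torch.softmax(logits, dim=-1)      # distribution over the next output word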

Recap

In the original attention mechanism (such as Bahdanau Attention, 2015), the query comes from the decoder, while the keys and values come from the encoder. This design is backed by a clear and intuitive motivation. Let’s explain this design in the context of machine translation, which was the earliest application of attention.

When translating a sentence from English to Chinese, we want the decoder to be able to selectively access the source-language context — to "look at" the parts of the input sentence that are most relevant at each step of the generation process.

Suppose we want to translate the sentence: English: "The cat sits on the mat." Chinese: "猫坐在垫子上。"

When the decoder is generating the word “垫子” (“mat”), the query is the decoder’s current hidden state. Its goal is essentially to ask: “Which part of the English sentence is related to the word I’m about to generate?”

The keys (i.e., all the encoder hidden states) serve as indexed reference points. The attention mechanism computes alignment scores between the query and each key to decide where to focus. The values (which are usually the same as the encoder outputs) contain the actual content. The attention weights are then used to compute a weighted sum over the values — producing the context vector that reflects what the decoder should focus on from the source sentence at this step. In this case, we use the encoder hidden states as both K and V.

So the process works like this: The decoder (query) asks a question, the encoder (keys) offers choices, and the encoder (values) supplies the content. This mechanism closely mimics how a human translator pays selective attention to the source sentence when producing each target word.

And if the context vector leads the model to make an incorrect prediction, the gradient signal during backpropagation essentially tells the model: “Hey, you attended to the wrong parts of the input — next time, adjust the scores so you look at the right tokens.”

--- Seq2Seq Attention Model with Additive Alignment: Implementation

In the previous section, we worked through a step-by-step mathematical example of how attention is computed in a Seq2Seq RNN model with additive (Bahdanau) attention. In short, this code is a toy version of the math example we just derived, showing how the same computations can be implemented in PyTorch.

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Use a font that can render Chinese characters (commonly available on macOS)
rcParams['font.sans-serif'] = ['Arial Unicode MS']
rcParams['axes.unicode_minus'] = False  # avoid issues rendering the minus sign
In [15]:
# -------------------------------
# Model
# -------------------------------

class Encoder(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True) 

    def forward(self, x_emb): # x_emb: (batch_size, seq_len, embed_dim)
        return self.rnn(x_emb) # out, h_T. the shape of out is (batch_size, seq_len, hidden_dim)


class AdditiveAttention(nn.Module):
    def __init__(self, hidden_dim, attn_dim):
        super().__init__()
        self.W_h = nn.Linear(hidden_dim, attn_dim)
        self.W_q = nn.Linear(hidden_dim, attn_dim)
        self.v = nn.Linear(attn_dim, 1, bias=False)

    def forward(self, decoder_state, encoder_outputs):
        # decoder_state: (batch_size, hidden_dim); encoder_outputs: (batch_size, seq_len, hidden_dim)
        seq_len = encoder_outputs.size(1) 
        decoder_exp = decoder_state.unsqueeze(1).expand(-1, seq_len, -1) # (batch_size, seq_len, hidden_dim) repeat query to match the shape of keys
        energy = torch.tanh(self.W_h(encoder_outputs) + self.W_q(decoder_exp))  # (batch_size, seq_len, attn_dim)
        scores = self.v(energy).squeeze(-1) # attention score (batch_size, seq_len) 
        weights = F.softmax(scores, dim=-1) # attention weights (batch_size, seq_len)
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1) # context vector (batch_size, hidden_dim)
        return context, weights


class Decoder(nn.Module):
    def __init__(self, embed_dim, hidden_dim, attn_dim, vocab_size):
        super().__init__()
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.attn = AdditiveAttention(hidden_dim, attn_dim)
        self.out = nn.Linear(hidden_dim * 2, vocab_size)

    def forward(self, input_emb, hidden, encoder_outputs): # input_emb: (batch_size, 1, embed_dim); hidden: (1, batch_size, hidden_dim); encoder_outputs: (batch_size, seq_len, hidden_dim)
        # For seq2seq models, decoder takes a single input token at a time (usually the output of the previous time step)
        # encoder_outputs contains the hidden states of the encoder at ALL time steps. To compute the context vector.
        rnn_out, hidden = self.rnn(input_emb, hidden)
        dec_state = rnn_out.squeeze(1)
        context, attn_weights = self.attn(dec_state, encoder_outputs)
        combined = torch.cat([dec_state, context], dim=-1)  # (batch_size, dec_hidden_dim + enc_hidden_dim)
        logits = self.out(combined) # (batch_size, vocab_size)
        return logits, hidden, attn_weights
In [16]:
# -------------------------------
# Data Preparation
# -------------------------------

# Define vocabulary and create mapping dictionaries
# vocab: List of all tokens including special tokens (<pad>, <sos>, <eos>) and words in both languages
# word2idx: Maps each word to a unique integer index for model input
#   e.g. {"<pad>": 0, "<sos>": 1, "<eos>": 2, "the": 3, "cat": 4, ...}
# idx2word: Reverse mapping from indices back to words for model output
#   e.g. {0: "<pad>", 1: "<sos>", 2: "<eos>", 3: "the", 4: "cat", ...}
vocab = ["<pad>", "<sos>", "<eos>", "the", "cat", "sits", "猫", "坐", "在"]
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}

# Embeddings (3D toy embeddings)
embedding_matrix = torch.tensor([
    [0.0, 0.0, 0.0],  # <pad>
    [0.5, 0.5, 0.5],  # <sos>
    [0.0, 0.0, 0.0],  # <eos>
    [1.0, 0.0, 1.0],  # the
    [0.0, 1.0, 0.0],  # cat
    [1.0, 1.0, 0.0],  # sits
    [1.0, 0.5, 0.0],  # 猫
    [0.0, 1.0, 1.0],  # 坐
    [1.0, 0.0, 0.5],  # 在
])
embedding_layer = nn.Embedding.from_pretrained(embedding_matrix)


input_tokens = ["the", "cat", "sits"]
target_tokens = ["<sos>", "猫", "坐", "在"]  # remove <eos> for simplicity

# Convert tokens to indices
# input_ids:  tensor([3, 4, 5])     # "the", "cat", "sits"
# target_ids: tensor([1, 6, 7, 8])  # "<sos>", "猫", "坐", "在"
input_ids = torch.tensor([word2idx[w] for w in input_tokens])
target_ids = torch.tensor([word2idx[w] for w in target_tokens])
In [17]:
# -------------------------------
# Training Configuration
# -------------------------------

# Hyperparameters
embed_dim = 3
hidden_dim = 2
attn_dim = 2
vocab_size = len(vocab)
lr = 0.01
epochs = 300

encoder = Encoder(embed_dim, hidden_dim)
decoder = Decoder(embed_dim, hidden_dim, attn_dim, vocab_size)
params = list(encoder.parameters()) + list(decoder.parameters())
optimizer = torch.optim.Adam(params, lr=lr)
criterion = nn.CrossEntropyLoss()


# -------------------------------
# Training Loop
# -------------------------------

# Training
input_emb = embedding_layer(input_ids).unsqueeze(0) # (1, seq_len, embed_dim)
for epoch in range(epochs):
    optimizer.zero_grad()
    # enc_out is used to compute the context vector; enc_hidden is used to initialize the decoder hidden state to ensure at the start of decoding, the decoder already possesses a high-level semantic representation of the input sentence.
    enc_out, enc_hidden = encoder(input_emb) # enc_out: (1, seq_len, hidden_dim); enc_hidden: (1, 1, hidden_dim)
    dec_hidden = enc_hidden
    loss = 0.0

    for t in range(len(target_ids) - 1):
        # Teacher Forcing: use the ground truth token instead of the predicted token to avoid exposure bias
        # while in inference, we have to use the predicted token to generate the next token
        dec_input_id = target_ids[t].unsqueeze(0)
        dec_input_emb = embedding_layer(dec_input_id).unsqueeze(1) # (1, 1, embed_dim)
        # predict the next token
        logits, dec_hidden, _ = decoder(dec_input_emb, dec_hidden, enc_out)
        loss += criterion(logits, target_ids[t + 1].unsqueeze(0))

    loss.backward()
    optimizer.step()
    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
Epoch 0, Loss: 6.6746
Epoch 50, Loss: 2.5382
Epoch 100, Loss: 1.7085
Epoch 150, Loss: 0.5144
Epoch 200, Loss: 0.2398
Epoch 250, Loss: 0.1537
In [20]:
# -------------------------------
# Inference (greedy decoding)
# -------------------------------

def greedy_decode(encoder, decoder, embedding_layer, input_ids, word2idx, idx2word, max_len=10, verbose=False):
    """
    Encodes the input sequence once with the encoder, then lets the decoder generate tokens 
    step by step by always selecting the most probable word (greedy), until <eos> is produced 
    or the maximum length is reached.

    Args:
        encoder (nn.Module): The encoder model (usually an RNN).
        decoder (nn.Module): The decoder model (with attention mechanism).
        embedding_layer (nn.Embedding): Embedding lookup layer for token IDs.
        input_ids (Tensor): Token IDs of the input sentence 
            (e.g., [3, 4, 5] corresponds to "the cat sits").
        word2idx (dict): Mapping from tokens to IDs.
        idx2word (dict): Mapping from IDs to tokens.
        max_len (int, optional): Maximum length of the generated sequence. Defaults to 10.
        verbose (bool, optional): If True, print each decoding step along with attention weights.
            Defaults to False.

    Returns:
        decoded_tokens (list of str): The generated output tokens.
        attention_maps (list of list[float]): Attention weights at each decoding step.
    """
    
    input_emb = embedding_layer(input_ids).unsqueeze(0)
    encoder_outputs, encoder_hidden = encoder(input_emb)
    decoder_hidden = encoder_hidden
    decoder_input = torch.tensor([word2idx["<sos>"]])
    decoded_tokens = []
    attention_maps = []

    for t in range(max_len):
        decoder_input_emb = embedding_layer(decoder_input).unsqueeze(1)
        logits, decoder_hidden, attn_weights = decoder(decoder_input_emb, decoder_hidden, encoder_outputs)
        pred_id = logits.argmax(dim=-1).item()
        pred_token = idx2word[pred_id]
        if verbose:
            print(f"[Step {t}] {pred_token}, attn: {attn_weights.squeeze().tolist()}")
        attention_maps.append(attn_weights.squeeze().tolist())
        if pred_token == "<eos>":
            break
        decoded_tokens.append(pred_token)
        decoder_input = torch.tensor([pred_id])

    return decoded_tokens, attention_maps

decoded, attn = greedy_decode(
    encoder, decoder, embedding_layer, input_ids, # input_ids:  tensor([3, 4, 5])     # "the", "cat", "sits"
    word2idx, idx2word, verbose=True
)
print("Generated:", decoded)
[Step 0] 猫, attn: [0.3320370614528656, 0.33273470401763916, 0.33522823452949524]
[Step 1] 坐, attn: [0.33232754468917847, 0.33286750316619873, 0.3348049521446228]
[Step 2] 在, attn: [0.33323949575424194, 0.33328646421432495, 0.3334740698337555]
[Step 3] 在, attn: [0.33301979303359985, 0.3331827223300934, 0.3337974548339844]
[Step 4] 坐, attn: [0.33262696862220764, 0.33300304412841797, 0.3343698978424072]
[Step 5] 在, attn: [0.33322328329086304, 0.3332785665988922, 0.33349815011024475]
[Step 6] 在, attn: [0.33288416266441345, 0.33312007784843445, 0.3339958190917969]
[Step 7] 坐, attn: [0.33265358209609985, 0.3330153524875641, 0.33433106541633606]
[Step 8] 在, attn: [0.3332301676273346, 0.333281934261322, 0.3334878385066986]
[Step 9] 在, attn: [0.3329254686832428, 0.3331390917301178, 0.333935409784317]
Generated: ['猫', '坐', '在', '在', '坐', '在', '在', '坐', '在', '在']
In [23]:
# -------------------------------
# Plot Attention Heatmap for Different Queries
# -------------------------------

def plot_attention(attn, input_tokens, output_tokens):
    fig, ax = plt.subplots()
    im = ax.imshow(attn, cmap="Blues")
    ax.set_xticks(range(len(input_tokens)))
    ax.set_xticklabels(input_tokens)
    ax.set_yticks(range(len(output_tokens)))
    ax.set_yticklabels(output_tokens)
    ax.set_xlabel("Input")
    ax.set_ylabel("Output")
    ax.set_title("Attention Heatmap")
    plt.colorbar(im)
    plt.show()

plot_attention(attn, input_tokens, decoded)
[Figure: attention heatmap of output tokens (y-axis) vs. input tokens (x-axis)]

--- Other Options of Query, Key and Value

How to Choose the Query

While the decoder's current hidden state is the most common choice for the query, there are several other options and variations. Here are a few alternatives (a small sketch of the first two follows the list):

  1. Decoder Input Embeddings

    • Instead of using the hidden state, the query can be formed from the embeddings of the decoder's current input token. This approach can sometimes provide a more direct alignment with the input features.
  2. Combination of Decoder Hidden States

    • A combination of the current and previous hidden states of the decoder can be used to form the query. This can provide richer context for computing attention weights.
    • For example, $q_t = \text{concat}(h_t, h_{t-1})$.
  3. Intermediate Layers of the Encoder

    • The query can be derived from an intermediate layer of the encoder instead of the final layer. This is particularly useful in deep encoder networks where intermediate representations might capture useful features.
  4. Learned Queries

    • In some models, queries are learned parameters. For instance, in Transformer models, queries are learned as part of the self-attention mechanism through parameter matrices applied to input embeddings.
  5. Self-Attention Mechanisms

    • In Transformer architectures, self-attention mechanisms use the same set of vectors (typically the input embeddings or their transformations) for queries, keys, and values. This means each element in the sequence attends to every other element, including itself.
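A minimal sketch of the first two options (hypothetical projection layers, toy dimensions similar to the earlier example):

import torch
import torch.nn as nn

embed_dim, hidden_dim, attn_dim = 3, 2, 2

# Option 1 (hypothetical): use the decoder's current input embedding as the query
input_emb = torch.randn(1, embed_dim)
W_emb_q = nn.Linear(embed_dim, attn_dim)       # project the embedding into the attention space
q_from_embedding = W_emb_q(input_emb)

# Option 2 (hypothetical): combine the current and previous decoder hidden states
h_t, h_prev = torch.randn(1, hidden_dim), torch.randn(1, hidden_dim)
W_cat_q = nn.Linear(hidden_dim * 2, attn_dim)  # project concat(h_t, h_{t-1}) into the attention space
q_from_states = W_cat_q(torch.cat([h_t, h_prev], dim=-1))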

How to Choose the Key and Value

In attention mechanisms, keys and values typically come from the encoder's hidden states, but there are various other options and extensions that can be used depending on the specific model architecture and task requirements. Here are some alternatives (a small sketch of options 1 and 6 follows the list):

  1. Intermediate Encoder Layers

    • Instead of using the final hidden states of the encoder, intermediate layers can be used as keys and values. This can capture different levels of abstraction in the input sequence.
  2. Previous Decoder States

    • In some models, previous decoder states or outputs are used as keys and values. This is common in self-attention mechanisms like in Transformers.
  3. Concatenation of Encoder States

    • Concatenating hidden states from different layers or time steps of the encoder to form richer keys and values.
  4. External Memory

    • External memory structures can be incorporated as keys and values, such as in memory-augmented neural networks.
  5. Learned Embeddings

    • Learned embeddings independent of the input sequence can be used. This approach can be seen in some variations of attention mechanisms in Transformers.
  6. Positional Encodings

    • Positional encodings can be added to the keys and values to incorporate positional information, especially in models like the Transformer.
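A minimal sketch of options 1 and 6 (hypothetical layers and toy shapes):

import torch
import torch.nn as nn

seq_len, embed_dim, hidden_dim = 3, 3, 2
x = torch.randn(1, seq_len, embed_dim)

# Option 1 (hypothetical): stack two single-layer RNNs so the intermediate layer's
# per-step outputs are accessible, and use them as keys/values instead of the top layer
rnn1 = nn.RNN(embed_dim, hidden_dim, batch_first=True)
rnn2 = nn.RNN(hidden_dim, hidden_dim, batch_first=True)
layer1_out, _ = rnn1(x)            # intermediate-layer states: (1, seq_len, hidden_dim)
layer2_out, _ = rnn2(layer1_out)   # top-layer states
keys = values = layer1_out         # intermediate layer as K/V

# Option 6 (hypothetical): add a learned positional embedding to the keys and values
pos_emb = nn.Embedding(seq_len, hidden_dim)
positions = pos_emb(torch.arange(seq_len)).unsqueeze(0)  # (1, seq_len, hidden_dim)
keys = keys + positions
values = values + positions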