DDPM (Denoising Diffusion Probabilistic Model)

DDPM is the foundational model of the diffusion family and forms the basis for understanding all subsequent variants. A diffusion model learns to reverse a noising process.

  • Forward process: start from clean data (e.g., an image) and gradually add Gaussian noise over many steps until it becomes pure noise.
  • Reverse process: a neural network is trained to invert the forward process step by step, reconstructing clean data from noise.
  • Sampling (generation): begin with random Gaussian noise and iteratively denoise it using the trained neural network to produce a realistic data sample.

Takeaways:

  1. Why does the forward process converge to a standard Gaussian?
  2. What are the input and output of the neural network in the reverse process?
  3. How do we compute the mean and variance of $p_\theta(\mathbf{x}_{t-1}\mid \mathbf{x}_t)$ in the reverse process?
  4. What are the steps of the generation process?
  5. How do we derive the L2 loss from MLE / ELBO?
  6. What is the principle of the skip connection in U-Net?
  7. How do we implement DDPM in PyTorch (U-Net as noise predictor)?
  8. In U-Net, how does the residual block differ from the skip connection?
  9. How do we add attention to U-Net?
  10. What are the loss functions in $x_0$-prediction and $v$-prediction?

--- DDPM Architecture¶

Forward Process: From Data to Noise¶

The forward process is a fixed, non-learned Markov chain that gradually adds noise to a clean input $\mathbf{x}_0$. The one‑step transition makes the forward process a Markov chain since each $\mathbf{x}_t$ depends only on $\mathbf{x}_{t-1}$, not the full history.

  1. One‑step transition

    At each time step $t$, we define $$ q(\mathbf{x}_t \mid \mathbf{x}_{t-1}) = \mathcal{N}\left(\mathbf{x}_t; \sqrt{1 - \beta_t} \mathbf{x}_{t-1}, \beta_t I \right) $$ That is, given $\mathbf{x}_{t-1}$, we sample $\mathbf{x}_t$ by:

    • Shrinking $\mathbf{x}_{t-1}$ a bit (multiply by $\sqrt{1 - \beta_t}$)
    • Adding Gaussian noise with variance $\beta_t$

    where $\beta_t \in (0, 1)$ is a small positive number controlling how much noise is added at step $t$. Typically chosen as a linear or cosine schedule, e.g. $$ \beta_t = \text{linspace}(10^{-4}, 0.02, T) $$

  2. Closed‑form for any time step

    We often define $\alpha_t = 1 - \beta_t$, and $\bar\alpha_t = \prod_{s=1}^{t} \alpha_s$. By recursively applying the above Gaussian transitions, the marginal at time $t$ has a simple form $$ q(\mathbf{x}_t \mid \mathbf{x}_0) = \mathcal{N}\bigl(\mathbf{x}_t; \sqrt{\bar\alpha_t}\mathbf{x}_0, (1-\bar\alpha_t)I\bigr) \quad\text{with}\quad \bar\alpha_t = \prod_{s=1}^t (1-\beta_s). $$ At training time we can sample any noisy state $\mathbf{x}_t$ directly from $\mathbf{x}_0$ and $\boldsymbol{\epsilon}$ without running the full chain $$ \mathbf{x}_t = \sqrt{\bar\alpha_t}\mathbf{x}_0 + \sqrt{1-\bar\alpha_t}\boldsymbol{\epsilon}, \qquad \boldsymbol{\epsilon} \sim \mathcal{N}(0,I). $$

    We can prove the closed‑form expression for $q(\mathbf{x}_t\mid \mathbf{x}_0)$ by simple induction. For $t=1$ we have $\mathbf{x}_1 = \sqrt{\alpha_1}\mathbf{x}_0 + \sqrt{1-\alpha_1}\boldsymbol{\epsilon}_1$ so the mean is $\sqrt{\alpha_1}\mathbf{x}_0$ and the variance is $(1-\alpha_1)I$, which matches $1-\bar\alpha_1$. Assume for some $t-1$ that $\mathbf{x}_{t-1} = \sqrt{\bar\alpha_{t-1}}\mathbf{x}_0 + \sqrt{1-\bar\alpha_{t-1}}\boldsymbol{\epsilon}'$ with $\boldsymbol{\epsilon}' \sim \mathcal{N}(0,I)$. Plugging this into $\mathbf{x}_t = \sqrt{\alpha_t}\mathbf{x}_{t-1} + \sqrt{1-\alpha_t}\boldsymbol{\epsilon}_t$ gives a mean $\sqrt{\alpha_t\bar\alpha_{t-1}}\mathbf{x}_0 = \sqrt{\bar\alpha_t}\mathbf{x}_0$ and a variance $\alpha_t(1-\bar\alpha_{t-1}) + (1-\alpha_t) = 1-\bar\alpha_t$. Thus by induction, $\mathbf{x}_t = \sqrt{\bar\alpha_t}\mathbf{x}_0 + \sqrt{1-\bar\alpha_t}\boldsymbol{\epsilon}$ with $\boldsymbol{\epsilon} \sim \mathcal{N}(0,I)$.

  3. Convergence to a standard Gaussian

    As $t$ grows, $\bar\alpha_t \to 0$, so the signal term shrinks and the noise term dominates. Mathematically, $\mathbf{x}_t = \sqrt{\bar\alpha_t}\mathbf{x}_0 + \sqrt{1-\bar\alpha_t}\boldsymbol{\epsilon} \rightarrow \boldsymbol{\epsilon} \sim \mathcal{N}(0,I)$, so the marginal $q(\mathbf{x}_t)$ approaches a standard Gaussian regardless of $\mathbf{x}_0$.
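
To make this concrete, here is a quick numeric sanity check (a sketch, assuming the linear schedule $\beta_t = \text{linspace}(10^{-4}, 0.02, T)$ with $T=1000$): simulating the chain step by step gives the same marginal statistics as the closed-form jump, and the final marginal is approximately $\mathcal{N}(0, I)$ even though the toy data is centered far from 0.

In [ ]:
import torch

torch.manual_seed(0)
T = 1000
betas = torch.linspace(1e-4, 0.02, T)
alphas = 1 - betas
alpha_bar = torch.cumprod(alphas, dim=0)

x0 = 3.0 + torch.randn(100_000)            # toy 1-D "data", mean 3
x = x0.clone()
for t in range(T):                         # run the forward chain step by step
    x = alphas[t].sqrt() * x + betas[t].sqrt() * torch.randn_like(x)

# closed-form jump directly to step T
x_jump = alpha_bar[-1].sqrt() * x0 + (1 - alpha_bar[-1]).sqrt() * torch.randn_like(x0)

print(alpha_bar[-1].item())                # ~4e-5: the signal term has vanished
print(x.mean().item(), x.std().item())     # ~0, ~1
print(x_jump.mean().item(), x_jump.std().item())  # matches the simulated chain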

Reverse Process: From Noise Back to Data¶

Gaussian Conditioning Formula. Given two Gaussians:

  • $p(\mathbf{x}) = \mathcal{N}(\mathbf{x}; \mu_{\mathbf{x}}, \Sigma_{\mathbf{x}})$
  • $p(\mathbf{y} \mid \mathbf{x}) = \mathcal{N}(\mathbf{y}; A \mathbf{x} + b, \Sigma_{\mathbf{y}})$

Then: $$ p(\mathbf{x} \mid \mathbf{y}) = \mathcal{N}\left(\mathbf{x}; \mu', \Sigma'\right) $$ where: $$ \Sigma' = \left(\Sigma_{\mathbf{x}}^{-1} + A^T \Sigma_{\mathbf{y}}^{-1} A\right)^{-1} $$ $$ \mu' = \Sigma' \left(\Sigma_{\mathbf{x}}^{-1} \mu_{\mathbf{x}} + A^T \Sigma_{\mathbf{y}}^{-1} (\mathbf{y} - b)\right) $$
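
As a quick check of this formula, here is a 1-D Monte-Carlo sketch (toy numbers, chosen only for illustration): compare the closed-form posterior with the empirical mean and variance of $x$ among samples whose $y$ falls near a fixed value.

In [ ]:
import torch

torch.manual_seed(0)
mu_x, var_x = 1.5, 0.49        # p(x) = N(mu_x, var_x)
a, b, var_y = 0.9, 0.1, 0.04   # p(y|x) = N(a*x + b, var_y)

# closed-form posterior p(x|y) from the conditioning formula (1-D case)
var_post = 1.0 / (1.0 / var_x + a * a / var_y)
mu_post = lambda y: var_post * (mu_x / var_x + a * (y - b) / var_y)

# Monte-Carlo: sample (x, y), then keep samples with y close to y0
x = mu_x + var_x ** 0.5 * torch.randn(2_000_000)
y = a * x + b + var_y ** 0.5 * torch.randn_like(x)
y0 = 1.4
mask = (y - y0).abs() < 0.005
print(mu_post(y0), x[mask].mean().item())  # both ~1.45
print(var_post, x[mask].var().item())      # both ~0.045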

The reverse process inverts the forward process by learning the reverse Markov chain. The forward process defines $$ q(\mathbf{x}_t \mid \mathbf{x}_{t-1}) = \mathcal{N}\left(\mathbf{x}_t; \sqrt{1 - \beta_t} \mathbf{x}_{t-1}, \beta_t I \right) $$ $$ q(\mathbf{x}_t \mid \mathbf{x}_0) = \mathcal{N}\bigl(\mathbf{x}_t; \sqrt{\bar\alpha_t}\mathbf{x}_0, (1-\bar\alpha_t)I\bigr) \quad\text{with}\quad \bar\alpha_t = \prod_{s=1}^t (1-\beta_s). $$ By the Gaussian Conditioning Formula, let $\mathbf{x} = \mathbf{x}_{t-1}$ (with prior $q(\mathbf{x}_{t-1} \mid \mathbf{x}_0)$) and $\mathbf{y} = \mathbf{x}_t$; then the posterior $q(\mathbf{x}_{t-1}\mid \mathbf{x}_t, \mathbf{x}_0)$ is still a Gaussian distribution whose mean and variance can be computed in closed form. Plugging in the parameters above, we obtain the reduced form (see the appendix for the detailed calculation) $$ \Sigma' = \frac{(1 - \bar\alpha_{t-1}) \beta_t}{1 - \bar\alpha_t} I $$ $$ \mu' = \frac{\sqrt{\bar\alpha_{t-1}} \beta_t}{1 - \bar\alpha_t} \mathbf{x}_0 + \frac{\sqrt{\alpha_t}(1 - \bar\alpha_{t-1})}{1 - \bar\alpha_t} \mathbf{x}_t $$ From this expression, we see that the variance $\Sigma'$ depends only on the time step. The mean $\mu'$, however, requires knowledge of $\mathbf{x}_0$. Since we only observe $\mathbf{x}_t$ but not $\mathbf{x}_0$, we need a way to approximate $\mathbf{x}_0$. To do this, we train a neural network $\boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t)$ (typically an MLP or U-Net) to output $\hat{\boldsymbol{\epsilon}}$: a prediction of the noise $\boldsymbol{\epsilon}$ added during the forward process. Then, using the forward process formula $$ \mathbf{x}_t = \sqrt{\bar\alpha_t} \mathbf{x}_0 + \sqrt{1 - \bar\alpha_t} \boldsymbol{\epsilon}, $$ we can rearrange and estimate $$ \hat{\mathbf{x}}_0 = \frac{1}{\sqrt{\bar\alpha_t}} \left( \mathbf{x}_t - \sqrt{1 - \bar\alpha_t} \cdot \hat{\boldsymbol{\epsilon}} \right) $$ Plugging $\hat{\mathbf{x}}_0$ into the formula for $\mu'$ gives an approximate mean, denoted $\mu_\theta(\mathbf{x}_t, t)$ (see the appendix for the detailed calculation) $$ \mu_\theta(\mathbf{x}_t, t) = \frac{1}{\sqrt{\alpha_t}} \left(\mathbf{x}_t - \frac{1 - \alpha_t}{\sqrt{1 - \bar\alpha_t}} \cdot \hat{\boldsymbol{\epsilon}} \right) $$ The intuition behind this expression: by removing the noise component from $\mathbf{x}_t$, we obtain a direction pointing toward $\mathbf{x}_{t-1}$.

That is, predicting the noise $\boldsymbol{\epsilon}$ is sufficient to approximate the mean of the reverse distribution. This is one of the most elegant and fundamental tricks in DDPM. However, since the predicted mean is computed via an estimate $\hat{\mathbf{x}}_0$, the resulting distribution is no longer the true reverse process $q(\mathbf{x}_{t-1} \mid \mathbf{x}_t)$, but rather an approximation. We denote it by $p_\theta(\mathbf{x}_{t-1} \mid \mathbf{x}_t)$; it is a Gaussian distribution: $$ p_\theta(\mathbf{x}_{t-1} \mid \mathbf{x}_t) = \mathcal{N}\left(\mathbf{x}_{t-1}; \mu_\theta(\mathbf{x}_t, t), \Sigma'\right) $$
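
The algebra that collapses the two-term mean (in $\hat{\mathbf{x}}_0$ and $\mathbf{x}_t$) into the compact form of $\mu_\theta$ above is easy to verify numerically; a minimal sketch, assuming the linear schedule:

In [ ]:
import torch

T = 200
betas = torch.linspace(1e-4, 0.02, T)
alphas = 1 - betas
abar = torch.cumprod(alphas, 0)
abar_prev = torch.cat([torch.tensor([1.0]), abar[:-1]])  # abar_0 := 1

t = 123
x_t = torch.randn(5)
eps_hat = torch.randn(5)

# two-term posterior mean with x0_hat plugged in
x0_hat = (x_t - (1 - abar[t]).sqrt() * eps_hat) / abar[t].sqrt()
mu_two_term = (abar_prev[t].sqrt() * betas[t] / (1 - abar[t])) * x0_hat \
            + (alphas[t].sqrt() * (1 - abar_prev[t]) / (1 - abar[t])) * x_t
# compact form
mu_compact = (x_t - (1 - alphas[t]) / (1 - abar[t]).sqrt() * eps_hat) / alphas[t].sqrt()
print(torch.allclose(mu_two_term, mu_compact, atol=1e-5))  # True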

The neural network $\boldsymbol{\epsilon}_\theta(\mathbf{x}_t,t)$ takes as input a noisy image $\mathbf{x}_t$ and a time step $t$, and outputs the predicted noise $\hat{\boldsymbol{\epsilon}}$ that was used to create $\mathbf{x}_t$ from $\mathbf{x}_0$ in the forward process. The training target is the actual noise $\boldsymbol{\epsilon}$ we sampled during forward diffusion. The loss is $$\mathcal{L} = \mathbb{E}_{\mathbf{x}_0, \boldsymbol{\epsilon}, t} \left[ \| \boldsymbol{\epsilon} - \hat{\boldsymbol{\epsilon}} \|^2 \right]$$ In the "Math Behind the Training Objective of DDPM" section we will see that DDPM's L2 loss is not an arbitrary choice. We'll introduce the variational objective used in DDPM and show that the L2 loss emerges naturally from it under a Gaussian assumption, specifically as the negative log-likelihood of a Gaussian with fixed variance.

Generation Process of DDPM: Iterative Denoising¶

Starting from $\mathbf{x}_T\sim\mathcal{N}(0,I)$, repeatedly apply $p_\theta(\mathbf{x}_{t-1}\mid \mathbf{x}_t)$ to obtain a sample $\mathbf{x}_{t-1}$, eventually obtaining a clean sample $\mathbf{x}_0$. Each step uses the predicted noise to compute a denoised mean, then samples from a Gaussian centered at that mean.

  • Initialize: $$\mathbf{x}_T \sim \mathcal{N}(0, I)$$

  • Iteratively sample: for $t = T, T-1, \dots, 1$, $$ \mu_\theta(\mathbf{x}_t, t) = \frac{1}{\sqrt{\alpha_t}} \left( \mathbf{x}_t - \frac{1- \alpha_t}{\sqrt{1 - \bar{\alpha}_t}} \cdot \boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t) \right) $$ $$ \mathbf{x}_{t-1} \sim \mathcal{N}\left( \mu_\theta(\mathbf{x}_t, t), \sigma_t^2 I \right) $$ where $\sigma_t$ only depends on the timestep $t$, and $\boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t)$ is the predicted noise.

--- Math Behind the Training Objective of DDPM¶

The theoretical objective of DDPM is to maximize the log-likelihood of the real data under the generative distribution. Let $q(x_0)$ denote the true data distribution and $p_\theta(x_0)$ the generative distribution defined by the model. $$ \max_\theta \ \mathbb{E}_{x_0 \sim q(x_0)} \big[ \log p_\theta(x_0) \big] = \max_\theta \; \int q(\mathbf{x}_0) \, \log p_\theta(\mathbf{x}_0) \, d\mathbf{x}_0 $$ where $\log p_\theta(\mathbf{x}_0)$ is the marginal likelihood. The empirical distribution $q(\mathbf{x}_0)$ does not depend on $\theta$, so optimizing $\mathbb{E}_{x_0 \sim q(\mathbf{x}_0)}[\log p_\theta(\mathbf{x}_0)]$ is equivalent to maximizing $\log p_\theta(\mathbf{x}_0)$ over the training samples. Thus the objective is $$\max_\theta \log p_\theta(\mathbf{x}_0)$$

0. Notation and Setup¶

  • Time steps: $t = 1, \dots, T$, with $\alpha_t = 1 - \beta_t,\ \bar\alpha_t = \prod_{s=1}^t \alpha_s$.

  • Forward “noising” Markov chain (known and fixed): $$q(x_t \mid x_{t-1}) \sim \mathcal N\big(\sqrt{\alpha_t}x_{t-1},\ \beta_t I\big).$$

    Equivalently, $$q(x_t \mid x_0) \sim \mathcal N\big(\sqrt{\bar\alpha_t}x_0,\ (1-\bar\alpha_t)I\big), \quad x_t = \sqrt{\bar\alpha_t}\,x_0 + \sqrt{1-\bar\alpha_t}\,\epsilon,\ \ \epsilon \sim \mathcal N(0,I).$$

  • Reverse “denoising” generative model (to be learned): $$p_\theta(x_{0}x_1...x_T) = p(x_T)\prod_{t=1}^T p_\theta(x_{t-1}\mid x_t), \quad p(x_T) = \mathcal N(0,I),$$

    and in general we let $$p_\theta(x_{t-1}\mid x_t) \sim \mathcal N\big(\mu_\theta(x_t,t),\ \sigma_t^2 I\big),$$

    where the variance $\sigma_t^2$ only depends on $t$.

1. From Likelihood to ELBO¶

The objective is to maximize the log-likelihood: $$ \log p_\theta(x_0) = \log \int p_\theta(x_{0}x_1...x_T) \,\mathrm{d}x_{1}x_2...x_T $$ We derive its ELBO (see the Variational Inference and VAE notes for what the ELBO is; this section closely parallels the analysis of the VAE training objective) by introducing a distribution $q(x_{1}x_2...x_T\mid x_0)$ (we choose the forward chain $q$) to enable Bayes' Rule $$ \begin{align} \log p_\theta(x_0) & = \log p_\theta(x_0 x_1...x_T) - \log p_\theta(x_1 x_2...x_T \mid x_0) \nonumber\newline & = \int q(x_{1}x_2...x_T\mid x_0) \left[ \log p_\theta(x_0 x_1...x_T) - \log p_\theta(x_1 x_2...x_T \mid x_0) \right] \mathrm{d}x_{1}x_2...x_T \nonumber\newline \end{align} $$ The equality still holds after taking the integral because $\int q(x_{1}x_2...x_T\mid x_0) \log p_\theta(x_0) \mathrm{d}x_{1}x_2...x_T = \log p_\theta(x_0)$. Inside the integral we subtract and add $\log q(x_1 x_2...x_T \mid x_0)$ to get $$ \begin{align} & = \int q(x_{1}x_2...x_T\mid x_0) \left[ \log p_\theta(x_0 x_1...x_T) - \log q(x_1 x_2...x_T \mid x_0) + \log q(x_1 x_2...x_T \mid x_0) - \log p_\theta(x_1 x_2...x_T \mid x_0) \right] \mathrm{d}x_{1}x_2...x_T \nonumber\newline & = \int q(x_{1}x_2...x_T\mid x_0) \left[ \log p_\theta(x_0 x_1...x_T) - \log q(x_1 x_2...x_T \mid x_0) \right] \mathrm{d}x_{1}x_2...x_T + D_{\mathrm{KL}}\left(q(x_1 x_2...x_T \mid x_0) \| p_\theta(x_1 x_2...x_T \mid x_0)\right) \nonumber\newline & \geq \int q(x_{1}x_2...x_T\mid x_0) \left[ \log p_\theta(x_0 x_1...x_T) - \log q(x_1 x_2...x_T \mid x_0) \right] \mathrm{d}x_{1}x_2...x_T \nonumber\newline \end{align} $$ since the KL divergence is always non-negative. Thus we obtain the ELBO as the learning objective: maximizing the ELBO approximately maximizes $\log p_\theta(x_0)$. $$ \begin{align} \mathrm{ELBO} & = \int q(x_{1}x_2...x_T\mid x_0) \left[ \log p_\theta(x_0 x_1...x_T) - \log q(x_1 x_2...x_T \mid x_0) \right] \mathrm{d}x_{1}x_2...x_T \nonumber\newline & = \mathbb{E}_{q}\left[\log p_\theta(x_{0}x_1...x_T) - \log q(x_{1}x_2...x_T\mid x_0)\right] \nonumber\newline \end{align} $$ Expanding both terms using the Markov factorization: $$ \begin{aligned} \log p_\theta(x_{0}x_1...x_T) &= \log p(x_T) + \sum_{t=1}^T \log p_\theta(x_{t-1}\mid x_t), \newline \log q(x_{1}x_2...x_T\mid x_0) &= \sum_{t=1}^T \log q(x_t \mid x_{t-1}). \end{aligned} $$ Finally, $$ \boxed{ \mathrm{ELBO} = \mathbb{E}_q\Big[\log p(x_T) + \sum_{t=1}^T \log p_\theta(x_{t-1}\mid x_t) - \sum_{t=1}^T \log q(x_t \mid x_{t-1})\Big] } $$

2. From ELBO to DDPM's L2 Loss¶

Add and subtract $\log q(x_T\mid x_0)$, and for each $t=2,\dots,T$, add and subtract $\log q(x_{t-1}\mid x_t,x_0)$. Then we obtain $$ \begin{align} \mathrm{ELBO} &=\mathbb E_q\Big[ \underbrace{\big(\log p(x_T)-\log q(x_T\mid x_0)\big)}_{\color{#888}{(A)}} \nonumber\newline &\qquad\qquad+ \sum_{t=2}^T \underbrace{\big(\log p_\theta(x_{t-1}\mid x_t)-\log q(x_{t-1}\mid x_t,x_0)\big)}_{\color{#888}{(B_t)}} \nonumber\newline &\qquad\qquad+ \underbrace{\log p_\theta(x_0\mid x_1)}_{\color{#888}{(C)}}\nonumber\newline &\qquad\qquad+ \underbrace{\Big(\log q(x_T\mid x_0)+\sum_{t=2}^T\log q(x_{t-1}\mid x_t,x_0)-\sum_{t=1}^T\log q(x_t\mid x_{t-1})\Big)}_{\color{#888}{(R)}} \Big]\nonumber\newline \end{align} $$ The entire term $(R)$ cancels out to 0. To see this, use Bayes' rule together with the Markov property $q(x_t\mid x_{t-1},x_0)=q(x_t\mid x_{t-1})$ to obtain the identity $$ \log q(x_{t-1}\mid x_t,x_0) = \log q(x_t\mid x_{t-1}) + \log q(x_{t-1}\mid x_0) - \log q(x_t\mid x_0) \quad (t\ge2). $$ Substituting this into $(R)$: the $\log q(x_t\mid x_{t-1})$ terms cancel the $t=2,\dots,T$ part of the last sum, leaving $-\log q(x_1\mid x_0)$; the sum $\sum_{t=2}^T\big(\log q(x_{t-1}\mid x_0)-\log q(x_t\mid x_0)\big)$ telescopes to $\log q(x_1\mid x_0)-\log q(x_T\mid x_0)$; together with the leading $\log q(x_T\mid x_0)$, everything cancels and $(R)=0$.

  • First group $(A)$: $$ \mathbb E_q[\log p(x_T)-\log q(x_T\mid x_0)] = -\mathrm{KL}\big(q(x_T\mid x_0)\|p(x_T)\big) = -L_T $$
  • Second group $(B_t)$. Notice that in the ELBO, all terms are expectations over the full approximate posterior $q(x_{1}x_2...x_T\mid x_0)$, i.e. $\mathbb E_q[\cdot] = \mathbb E_{q(x_{1}x_2...x_T\mid x_0)}[\cdot]$. So for each $t=2,\dots,T$, by the law of total expectation: $$ \begin{align} \mathbb E_q[\log p_\theta(x_{t-1}\mid x_t)-\log q(x_{t-1}\mid x_t,x_0)] & = \mathbb E_{q(x_{1}x_2...x_T\mid x_0)} \Big[ \log p_\theta(x_{t-1}\mid x_t)-\log q(x_{t-1}\mid x_t,x_0)\Big] \nonumber\newline & = \mathbb E_{q(x_{t-1}x_t\mid x_0)} \Big[ \log p_\theta(x_{t-1}\mid x_t)-\log q(x_{t-1}\mid x_t,x_0)\Big] \nonumber\newline & = \mathbb E_{q(x_t\mid x_0)} \mathbb E_{q(x_{t-1}\mid x_t,x_0)} \Big[ \log p_\theta(x_{t-1}\mid x_t)-\log q(x_{t-1}\mid x_t,x_0)\Big] \nonumber\newline & = -\mathbb E_q\Big[ \mathrm{KL}\big(q(x_{t-1}\mid x_t,x_0)\|p_\theta(x_{t-1}\mid x_t)\big)\Big] = -L_{t-1} \nonumber\newline \end{align} $$
  • The third group $(C)$ is kept as is. Define $$ \mathbb E_q\log p_\theta(x_0\mid x_1) = - L_0 $$ Therefore, $$ \boxed{ \mathrm{ELBO} = -L_T-\sum_{t=2}^T L_{t-1} - L_0 } $$ Intuitively:
  • $L_T$ encourages the final noise to match the prior $\mathcal N(0,I)$. Since both distributions are Gaussian with known parameters, we can actually get a closed form of $L_T$. During training, $L_T$ is usually not included in the loss because it does not provide gradients for $\theta$, but when evaluating the upper bound, it can be computed exactly.
  • $L_0$ is the negative log-likelihood of the real $x_0$ under the learned reverse distribution. Since $p_\theta(x_0\mid x_1)=\mathcal N\big(\mu_\theta(x_1,1),\ \sigma_1^2 I\big)$, we get $$ L_0 \quad = \quad \mathbb E_q\left[\frac{1}{2\sigma_1^2}\|x_0-\mu_\theta(x_1,1)\|^2\right]+\text{const} \quad \propto \quad \mathbb E_q\left[ \|x_0-\mu_\theta(x_1,1)\|^2 \right] $$ Notice that $\mu_\theta(x_1,1)$ will be computed from $\epsilon_\theta(x_1,1)$. In the original DDPM (Ho et al. 2020), training often does not include $L_0$ separately. Improved or hybrid loss versions may add a small weight of $L_0$, which can slightly improve likelihood stability.
  • Each intermediate $L_{t-1}$ encourages the learned reverse to match the true reverse. The L2 loss in DDPM actually comes from this term. For a Gaussian Markov chain, each $L_{t-1}$ term becomes a quadratic form with respect to the means. By rewriting the means in terms of "predicted noise", we obtain a weighted L2 loss on $\epsilon$.

Since the forward process is a Gaussian Markov chain, the reverse conditional distribution $q(x_{t-1}\mid x_t,x_0)$ is also Gaussian, with its mean and variance given in closed form by the Gaussian conditioning formula. In the reverse process, we approximate $q(x_{t-1}\mid x_t,x_0)$ by $$ p_\theta(x_{t-1}\mid x_t)=\mathcal N\big(\mu_\theta(x_t,t),\ \sigma_t^2 I\big), $$ where the variance is the same as that of $q(x_{t-1}\mid x_t, x_0)$, while the mean is learned through a neural network (by learning $\epsilon$, in fact). For Gaussians with equal variance, the KL term in $L_{t-1}$ reduces to a quadratic form in the means: $$ L_{t-1} =\mathbb E_q \Big[ \mathrm{KL}\big(\mathcal N(\mu_q,\sigma_t^2 I)\,\|\,\mathcal N(\mu_\theta,\sigma_t^2 I)\big) \Big] =\frac{1}{2\sigma_t^2}\mathbb E_q\big[\|\mu_q-\mu_\theta\|^2\big]+\text{const}. $$ We can rewrite the means in terms of "predicted noise", $$ \mu = \frac{1}{\sqrt{\alpha_t}} \left(x_t - \frac{1 - \alpha_t}{\sqrt{1 - \bar\alpha_t}} \cdot \epsilon \right) $$ so that $$ \mu_q-\mu_\theta=\frac{1}{\sqrt{\alpha_t}}\frac{1-\alpha_t}{\sqrt{1-\bar\alpha_t}}\Big(\epsilon-\epsilon_\theta(x_t,t)\Big) $$ Plugging this into $L_{t-1}$ gives $$ \boxed{ L_{t-1}=\underbrace{\frac{(1-\alpha_t)^2}{2\sigma_t^2\alpha_t(1-\bar\alpha_t)}}_{=:w_t} \mathbb E_q\left[\big\|\epsilon-\epsilon_\theta(x_t,t)\big\|^2\right]+\text{const} } $$ In other words, at each timestep, the KL divergence between $q$ and $p_{\theta}$ measures the discrepancy between the "true noise" $\epsilon$ and the "network-predicted noise" $\epsilon_\theta$.

Strictly speaking, the ELBO results in a time-weighted noise MSE, but as shown in Ho et al. (2020) and subsequent works, using a simpler and more numerically stable objective directly leads to better sample quality. This is equivalent to removing the above weight $w_t$ (or absorbing it into the learning rate/sampling distribution), and uniformly sampling the timestep $t$. $$ \boxed{ L_{\mathrm{simple}}=\mathbb E_{t\sim\mathcal U(\{1,\dots,T\}),\ x_0\sim q,\ \epsilon\sim\mathcal N(0,I)} \Big[\ \big\|\epsilon-\epsilon_\theta(x_t,t)\big\|^2\ \Big] } $$ Many implementations train directly with $L_{\mathrm{simple}}$ (sometimes adding a small weight of $L_0$ as a reconstruction/perceptual constraint).
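
To see concretely why dropping $w_t$ matters, we can evaluate the weights for the standard linear schedule (a sketch, taking $\sigma_t^2$ to be the posterior variance): they range over roughly two orders of magnitude across $t$, so the unweighted $L_{\mathrm{simple}}$ rebalances the objective considerably.

In [ ]:
import torch

T = 1000
betas = torch.linspace(1e-4, 0.02, T)
alphas = 1 - betas
abar = torch.cumprod(alphas, 0)
abar_prev = torch.cat([torch.tensor([1.0]), abar[:-1]])
sigma2 = betas * (1 - abar_prev) / (1 - abar)   # posterior variance

# w_t = (1-alpha_t)^2 / (2 sigma_t^2 alpha_t (1-abar_t)); note w[0] is infinite
w = betas ** 2 / (2 * sigma2 * alphas * (1 - abar))
print(w[1].item(), w[T // 2].item(), w[-1].item())  # largest at small t, ~100x smaller later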

--- DDPM Implementation (Simple MLP as Noise Predictor)¶

In [9]:
import math, argparse, os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
if hasattr(torch, "set_float32_matmul_precision"):
    torch.set_float32_matmul_precision("high")  # use TF32 matmuls on supported GPUs
In [ ]:
# ---------------------------
# Utilities
# ---------------------------
def cosine_beta_schedule(T, s=0.008):
    """
    Example:
        >>> betas = cosine_beta_schedule(5)
        >>> betas
        tensor([0.0064, 0.0243, 0.0630, 0.1310, 0.2250])
    """
    steps = T
    x = torch.linspace(0, T, steps+1, dtype=torch.float32)
    alphas_cumprod = torch.cos(((x/T)+s)/(1+s)*math.pi/2)**2
    alphas_cumprod = alphas_cumprod/alphas_cumprod[0]
    betas = 1 - (alphas_cumprod[1:]/alphas_cumprod[:-1])
    return betas.clamp(1e-5, 0.999)

def t_embed(t, dim=64):
    """
    Map time step t to a vector. Input a number, output a vector.
    Benefits: Removes scale effects; Capture relative relationships; Unifies representation range.
    # Example:
    # Input: t = torch.tensor([1, 10, 100]), dim=8
    # Output: t_embed(t, 8) =>
    # tensor([[ 0.8415,  0.9093,  0.1411,  0.7568,  0.5403, -0.4161, -0.9899,  0.6536],
    #         [ 0.9415,  0.1367,  0.0560,  0.6251,  0.3366,  0.9905, -0.9984,  0.7805],
    #         [ 0.5064, -0.7061, -0.0627,  0.4431, -0.8623,  0.7081, -0.9980,  0.8965]])
    """
    half = dim // 2
    freqs = torch.exp(torch.arange(half, device=t.device)*(-math.log(10000.0)/max(half-1,1)))
    args = t.float().unsqueeze(1)*freqs.unsqueeze(0)
    emb = torch.cat([torch.sin(args), torch.cos(args)], dim=1)
    if dim % 2 == 1: emb = torch.nn.functional.pad(emb, (0,1))
    return emb
In [ ]:
# ---------------------------
# Forward and Reverse Process
# ---------------------------
class Diffusion:
    def __init__(self, T=200, beta_schedule="cosine", device="cpu"):
        if beta_schedule == "cosine":
            betas = cosine_beta_schedule(T).to(device)
        else:
            betas = torch.linspace(1e-4, 0.02, T, device=device)
        self.T = T
        self.device = device
        self.betas = betas
        self.alphas = 1. - betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1. - self.alphas_cumprod)
        # series of \bar{a_{t-1}}; let \bar{a_0} = 1. Only used in self.posterior_variance.
        self.alphas_cumprod_prev = torch.cat([torch.tensor([1.], device=device), self.alphas_cumprod[:-1]])
        # Posterior variance of q(x_{t-1} | x_t, x_0) from the Gaussian conditioning formula; used as the variance of p_theta(x_{t-1} | x_t).
        self.posterior_variance = betas * (1. - self.alphas_cumprod_prev) / (1. - self.alphas_cumprod)

    @torch.no_grad()
    def sample(self, model, shape):
        x = torch.randn(shape, device=self.device)
        for t in reversed(range(self.T)): # t: T-1, T-2, ..., 0
            # duplicate t to (batchsize,), so that each sample corresponds to its own current timestep
            t_cur = torch.full((shape[0],), t, device=self.device, dtype=torch.long)
            eps = model(x, t_cur)
            alpha = self.alphas[t]
            alpha_bar = self.alphas_cumprod[t]
            beta = self.betas[t]
            mean = (1/torch.sqrt(alpha))*(x - (beta/torch.sqrt(1-alpha_bar))*eps)
            if t > 0:
                noise = torch.randn_like(x)
                var = self.posterior_variance[t]
                x = mean + torch.sqrt(var)*noise
            else: # t == 0: last step, no noise added; return the mean as the final x_0.
                x = mean
        return x

    def q_sample(self, x0, t, noise=None): # Closed-form forward sampling: jump from x_0 directly to x_t. Input t is an integer tensor, not an embedding.
        if noise is None: noise = torch.randn_like(x0)
        # self.sqrt_alphas_cumprod[t] has shape (batch,), x0 has shape (batch, xdim)
        # So we add [:, None] → (batch, 1), to enable element-wise multiplication
        return self.sqrt_alphas_cumprod[t][:,None]*x0 + self.sqrt_one_minus_alphas_cumprod[t][:,None]*noise, noise
In [ ]:
# ---------------------------
# Simple MLP εθ(x_t, t)
# ---------------------------
class EpsModel(nn.Module):
    def __init__(self, xdim, tdim=64, hidden=256):
        super().__init__()
        self.tproj = nn.Sequential(nn.Linear(tdim, hidden), nn.SiLU()) 
        self.net = nn.Sequential(
            nn.Linear(xdim+hidden, hidden), nn.SiLU(),
            nn.Linear(hidden, hidden), nn.SiLU(),
            nn.Linear(hidden, xdim)
        )
    def forward(self, x, t):
        emb = t_embed(t, dim=self.tproj[0].in_features) # emb shape: (B, dim)
        temb = self.tproj(emb)  # tproj learns to adapt the fixed sinusoidal embedding into the network's hidden space
        h = torch.cat([x, temb], dim=1)
        return self.net(h)
In [13]:
# ---------------------------
# Training step (MSE on ε)
# ---------------------------
def train(model, diffusion, loader, epochs=5, lr=2e-4, device="cpu", log_every=200):
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    mse = nn.MSELoss()
    step=0
    model.train()
    for ep in range(1, epochs+1):
        for (x,) in loader:
            x = x.to(device)
            # t is randomly sampled for each data point in the batch.
            # Why random? This follows the DDPM training strategy (Ho et al., 2020),
            # where the model is trained to predict noise at a random timestep t.
            t = torch.randint(0, diffusion.T, (x.size(0),), device=device).long()
            x_t, noise = diffusion.q_sample(x, t)
            pred = model(x_t, t)
            loss = mse(pred, noise)
            opt.zero_grad(); loss.backward(); opt.step()
            if step % log_every == 0:
                print(f"[epoch {ep}] step {step} loss {loss.item():.4f}")
            step += 1
In [14]:
# ---------------------------
# Datasets
# ---------------------------
def make_gaussian(n=20000, mean=0., std=1., device="cpu"):
    x = torch.randn(n, 1)*std + mean
    return TensorDataset(x.to(device))

def make_circle(n=20000, radius=2.0, noise=0.05, device="cpu"):
    angles = torch.rand(n)*2*math.pi
    x = torch.stack([torch.cos(angles), torch.sin(angles)], dim=1)*radius
    x += noise*torch.randn_like(x)
    return TensorDataset(x.to(device))

def make_mnist(device="cpu"):
    # MNIST flattened & normalized to [-1, 1]
    from torchvision import datasets, transforms
    tfm = transforms.Compose([transforms.ToTensor(), transforms.Lambda(lambda x: x*2-1), transforms.Lambda(lambda x: x.view(-1))])
    ds = datasets.MNIST(root="./data", train=True, download=True, transform=tfm)
    X = torch.stack([ds[i][0] for i in range(len(ds))]).to(device)
    return TensorDataset(X)
In [18]:
# ---------------------------
# Training
# ---------------------------
data = "circle"
batch = 256
epochs = 5
T = 200
lr = 2e-4
device = "cuda" if torch.cuda.is_available() else "cpu"

# dataset
if data == "gaussian":
    ds = make_gaussian(device=device); xdim = 1
elif data == "circle":
    ds = make_circle(device=device); xdim = 2
else:
    ds = make_mnist(device=device); xdim = 28*28

dl = DataLoader(ds, batch_size=batch, shuffle=True, drop_last=True)

# diffusion + model
diffusion = Diffusion(T=T, device=device)
model = EpsModel(xdim=xdim).to(device)

# train
train(model, diffusion, dl, epochs=epochs, lr=lr, device=device)
[epoch 1] step 0 loss 0.9971
[epoch 3] step 200 loss 0.5452
In [19]:
# ---------------------------
# Sampling
# ---------------------------
model.eval()
with torch.no_grad():
    samples = diffusion.sample(model, (64, xdim)).cpu()

# save few samples
os.makedirs("samples", exist_ok=True)
if xdim == 1:
    torch.save(samples, "samples/gaussian.pt")
    print("Saved samples to samples/gaussian.pt (tensor of shape [64, 1])")
elif xdim == 2:
    torch.save(samples, "samples/circle.pt")
    print("Saved samples to samples/circle.pt (tensor of shape [64, 2])")
else:
    import torchvision.utils as vutils
    imgs = (samples.view(-1,1,28,28).clamp(-1,1)+1)/2
    grid = vutils.make_grid(imgs, nrow=8)
    vutils.save_image(grid, "samples/mnist.png")
    print("Saved image grid to samples/mnist.png")
Saved samples to samples/circle.pt (tensor of shape [64, 2])

--- U-Net As Noise Predictor (Instead of MLP)¶

What is U-Net?¶

What is the background of U-Net? U-Net was not proposed in the original DDPM paper; it actually dates back to 2015. However, back then it was introduced for image segmentation rather than generation. Image segmentation means: given an input image $x \in \mathbb{R}^{C\times H\times W}$, the output is a mask in $\mathbb{R}^{K\times H\times W}$, where each pixel has probabilities for $K$ different classes (for example: sky, tree, person), so each pixel is assigned to the object or region it belongs to.

Ronneberger et al., "U-Net: Convolutional Networks for Biomedical Image Segmentation" first proposed U-Net for biomedical image segmentation—tasks like segmenting "cell nuclei", "tumor regions", or "organ boundaries" from CT or microscope images. U-Net is structured as encoder + decoder + skip connections. As shown in the figure below, it's called "U-Net" because its architecture looks like the letter "U".

Encoder                                               Decoder
─────────────────────────────────────                ─────────────────────────────────────
Input → [Enc1] → pool → [Enc2] → pool → [Bottleneck] → up → [Dec1] → up → [Dec2] → Output
            │              │                                  ↑               ↑
            │              └──────── skip1 (Enc2→Dec1) ──────┘               │
            └─────────────────────── skip2 (Enc1→Dec2) ──────────────────────┘

In this network:

  • Encoder: Repeatedly uses Conv2d + pooling to reduce the image resolution $(H, W)$ and increase the number of channels, making the representation coarser but more abstract, so it learns semantic features (e.g., "there is a cat in this image").
  • Decoder: Uses ConvTranspose2d or interpolate to increase the image resolution and decrease the number of channels, gradually restoring the original image size, e.g., from 32×32 → 64×64 → 128×128.
  • Bottleneck: This is the endpoint of the encoder and the starting point of the decoder. The encoder compresses the information to this point; the decoder restores it from here. This is the most compressed semantic representation of the entire image: the spatial resolution is the smallest, channel count is the largest, and features are the most abstract. At this point, the model has a "global view" and understands "what is in the image," but spatial details are lost.
  • Skip Connection: The core idea is to connect decoder and encoder layers of the same spatial resolution (same $H \times W$). Features from different encoder layers (with different resolutions) are concatenated with decoder layers of the same size, so each decoder layer receives both global semantics (from the decoder) and local details (from the corresponding encoder layer). Also, only when $H \times W$ matches can feature maps be concatenated along the channel dimension (dim=1) with torch.cat().

Here is the code for a toy 2-layer encoder + 2-layer decoder U-Net, which fleshes out the structure sketch above.

In [ ]:
import torch.nn.functional as F  # for F.max_pool2d below

class ToyUNet(nn.Module):
    def __init__(self, in_ch=3, out_ch=2):
        super().__init__()
        # --- Encoder ---
        self.enc1 = nn.Sequential(nn.Conv2d(in_ch, 16, 3, padding=1),nn.ReLU()) # -> x1: [B,16,H,W]
        self.enc2 = nn.Sequential(nn.Conv2d(16, 32, 3, padding=1),nn.ReLU()) # -> x3: [B,32,H/2,W/2]
        # --- Decoder ---
        self.up1 = nn.ConvTranspose2d(32, 16, kernel_size=2, stride=2)
        self.dec1 = nn.Sequential(nn.Conv2d(16 + 32, 16, 3, padding=1),nn.ReLU())
        self.up2 = nn.ConvTranspose2d(16, 8, kernel_size=2, stride=2) 
        self.dec2 = nn.Sequential(nn.Conv2d(8 + 16, 8, 3, padding=1),nn.ReLU())
        self.out_conv = nn.Conv2d(8, out_ch, 1)

    def forward(self, x):
        # --- Encoder ---
        x1 = self.enc1(x)                # [B,16,H,W]
        x2 = F.max_pool2d(x1, 2)         # [B,16,H/2,W/2]
        x3 = self.enc2(x2)               # [B,32,H/2,W/2]
        bottleneck = F.max_pool2d(x3, 2) # [B,32,H/4,W/4]
        # --- Decoder ---
        y = self.up1(bottleneck)        # [B,16,H/2,W/2]
        y = torch.cat([y, x3], dim=1)   # Enc2 → Dec1. shape: [B, 16 + 32, H/2, W/2]
        y = self.dec1(y)                # [B,16,H/2,W/2]

        y = self.up2(y)                 # [B,8,H,W]
        y = torch.cat([y, x1], dim=1)   # Enc1 → Dec2. shape: [B, 8 + 16, H, W]
        y = self.dec2(y)                # [B,8,H,W]

        return self.out_conv(y)      
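
A quick shape check of the toy network (the input height/width must be divisible by 4 because of the two pooling steps):

In [ ]:
net = ToyUNet(in_ch=3, out_ch=2)
x = torch.randn(1, 3, 64, 64)
print(net(x).shape)  # torch.Size([1, 2, 64, 64]): per-pixel logits for out_ch classes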

Use U-Net in DDPM¶

The original DDPM paper (Ho et al., "Denoising Diffusion Probabilistic Models") used U-Net as the backbone for the noise prediction network εθ(xₜ, t). Since then, nearly all diffusion models (including conditional DDPM, LDM, and Stable Diffusion) have continued to adopt this U-Net architecture.

  • Why does DDPM use U-Net as the noise predictor?

    The task of DDPM is: given a noisy image as input, output a noise map of the same size. In other words, this is a pixel-wise mapping task (image-to-image regression), the same family as image segmentation, super-resolution, and denoising. U-Net was specifically designed for such tasks; at the time, it was the most mature dense-prediction model (pixel-level output), making it especially suitable for predicting noise maps with the same shape as the input.

  • How to use U-Net as noise predictor in DDPM?

    Recall that in DDPM, the noise prediction network $\epsilon_\theta(\mathbf{x}_t, t)$ takes $\mathbf{x}_t$ (shape $(B, C, H, W)$) and $t$ (shape $(B, )$) as input, and outputs the noise $\hat{\boldsymbol\epsilon}$ (shape $(B, C, H, W)$). To incorporate $t$ into the network, we compute a global embedding of $t$ via an MLP, and inject the time embedding into each layer (by addition or concatenation).

  • Why use an MLP to compute one global time embedding instead of generating a new embedding at each injection point?

    Because this vector represents a global attribute: which step $t$ of the diffusion process the entire image is currently at, that is, how much noise it carries. $t$ is a global variable; it does not change across space or layers; for the whole image, $t$ is the same everywhere. Repeatedly generating new embeddings would be redundant, since the time information itself doesn't change; the only difference is how each layer interprets that information.

  • Why inject the time embedding into every block?

    Because: In the early layers (high resolution), the model needs to know "this is a low-noise image at step t"; in the deeper layers (low resolution), the model needs to know "this is a high-noise latent feature at step t". The level of noise and the details to be restored change with time, and each layer's features depend differently on the time step. Therefore, Improved DDPM, DDIM, and Stable Diffusion all choose to inject the time embedding into every block (or ResidualBlock).

In [ ]:
def timestep_embedding(t, dim): # t: [B] (int); return: [B, dim]
    """
    sinusoidal position embedding, freqs = exp(-log(10000) * arange(0, half) / half)    (from DDPM paper Appendix B)
    Remove the scale effect of timestamps like 1, 2, 3, ..., 100, ...
    Allows the network to sense the relationship between different time steps (for example, 1 and 2 are close, 1 and 100 are far apart).
    """
    half = dim // 2
    freqs = torch.exp(-math.log(10000) * torch.arange(0, half, dtype=torch.float32) / half) # freqs shape: (half,)
    args = t[:, None].float() * freqs[None] # args shape: (B, half)
    emb = torch.cat([torch.sin(args), torch.cos(args)], dim=-1) # emb shape: (B, dim)
    return emb

class ToyUNet_DDPM(nn.Module):
    def __init__(self, in_ch=3):
        super().__init__()
        self.in_ch = in_ch
        out_ch = in_ch  # noise prediction → same shape as input
        # --- Time embedding ---
        self.time_mlp = nn.Sequential( # global time embedding. also to match the bottleneck dimension
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32)
        )
        self.time_proj_enc1 = nn.Linear(32, 16)
        self.time_proj_enc2 = nn.Linear(32, 32)
        self.time_proj_dec1 = nn.Linear(32, 16)
        self.time_proj_dec2 = nn.Linear(32, 8)
        # --- Encoder ---
        self.enc1 = nn.Sequential(nn.Conv2d(in_ch, 16, 3, padding=1), nn.ReLU())
        self.enc2 = nn.Sequential(nn.Conv2d(16, 32, 3, padding=1), nn.ReLU())
        # --- Decoder ---
        self.up1 = nn.ConvTranspose2d(32, 16, 2, stride=2)
        self.dec1 = nn.Sequential(nn.Conv2d(16 + 32, 16, 3, padding=1), nn.ReLU())
        self.up2 = nn.ConvTranspose2d(16, 8, 2, stride=2)
        self.dec2 = nn.Sequential(nn.Conv2d(8 + 16, 8, 3, padding=1), nn.ReLU())
        self.out_conv = nn.Conv2d(8, out_ch, 1)

    def forward(self, x, t): # x: [B, C, H, W], t: [B] (int)
        # --- time embedding ---
        temb = timestep_embedding(t, dim=128)   # [B,128]
        temb = self.time_mlp(temb)              # [B,32]
        # --- Encoder ---
        x1 = self.enc1(x) + self.time_proj_enc1(temb)[:, :, None, None]                      # [B, 16, H, W]
        x2 = F.max_pool2d(x1, 2)               # [B, 16, H/2, W/2]
        x3 = self.enc2(x2) + self.time_proj_enc2(temb)[:, :, None, None]                     # [B, 32, H/2, W/2]
        bottleneck = F.max_pool2d(x3, 2) + temb[:, :, None, None]      # [B, 32, H/4, W/4]
        # --- Decoder ---
        y = self.up1(bottleneck)                # [B, 16, H/2, W/2]
        y = torch.cat([y, x3], dim=1)           # [B, 16 + 32, H/2, W/2]
        y = self.dec1(y) + self.time_proj_dec1(temb)[:, :, None, None]                       # [B, 16, H/2, W/2]

        y = self.up2(y)                         # [B, 8, H, W]
        y = torch.cat([y, x1], dim=1)           # [B, 8 + 16, H, W]
        y = self.dec2(y) + self.time_proj_dec2(temb)[:, :, None, None]                       # [B, 8, H, W]

        return self.out_conv(y) # [B, out_ch, H, W]
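
A quick shape check: the predicted noise has exactly the same shape as the input, as required of $\epsilon_\theta(\mathbf{x}_t, t)$:

In [ ]:
net = ToyUNet_DDPM(in_ch=3)
x = torch.randn(4, 3, 32, 32)     # a batch of noisy images x_t
t = torch.randint(0, 200, (4,))   # one timestep per sample
print(net(x, t).shape)  # torch.Size([4, 3, 32, 32])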

Add residual block to U-Net¶

There are two types of "skip connections" in DDPM. Both allow information to bypass intermediate layers, but they differ in scope and mathematical operation.

  • U-Net long skip connections (encoder↔decoder): the key is concatenation, not addition.
  • ResBlock short residual connections (addition): usually happen between consecutive layers (short distance). For example, $y = F(x) + x$ (the key is element-wise addition). This occurs in blocks such as "Input x → Layer1 → Layer2 → Layer3 → Output." The residual block in U-Net is essentially the same concept as in ResNet.
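
Below is a minimal residual block with time-embedding injection, in the style of DDPM-flavored U-Nets (a sketch under assumed channel sizes, not the exact block from Ho et al.; GroupNorm here assumes channel counts divisible by 8):

In [ ]:
import torch
import torch.nn as nn

class ResBlock(nn.Module):
    def __init__(self, in_ch, out_ch, tdim):
        super().__init__()
        self.conv1 = nn.Sequential(nn.GroupNorm(8, in_ch), nn.SiLU(),
                                   nn.Conv2d(in_ch, out_ch, 3, padding=1))
        self.temb_proj = nn.Linear(tdim, out_ch)  # inject t into the block
        self.conv2 = nn.Sequential(nn.GroupNorm(8, out_ch), nn.SiLU(),
                                   nn.Conv2d(out_ch, out_ch, 3, padding=1))
        # 1x1 conv so the identity path matches the new channel count
        self.skip = nn.Conv2d(in_ch, out_ch, 1) if in_ch != out_ch else nn.Identity()

    def forward(self, x, temb):   # x: [B, in_ch, H, W], temb: [B, tdim]
        h = self.conv1(x)
        h = h + self.temb_proj(temb)[:, :, None, None]  # broadcast over H, W
        h = self.conv2(h)
        return h + self.skip(x)   # short residual connection: addition, not concat

Inside a U-Net, such blocks replace the plain Conv+ReLU stages, while the long encoder-decoder skips (concatenation) stay unchanged.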

Add attention to U-Net¶

Self-Attention in U-Net is an independent module that is "inserted" at a specific convolutional layer. It only models global relationships within that layer and does not connect across layers. Its Q, K, and V all come from the feature map of this layer, and it models the global dependencies among all pixels within this feature map. Typically, it looks like:

Conv → [optional Self-Attention] → Conv

Notice that for $X_{out} = \text{Attention}(X_{in})$, the shape of $X_{out}$ is the same as that of $X_{in}$.

We usually use self-attention in low resolution layers because

  • The complexity of self-attention is $O(N^2)$, where $N = H \times W$. For high-resolution layers (such as 64×64 or 128×128), $N$ is very large, and the computational and memory cost becomes prohibitive.
  • Convolutions already capture local features, so self-attention at medium or low resolutions is used to model global dependencies (the semantic relationships of the whole image), while the high-resolution layers are expected to add fine details rather than global dependencies.

Example of Computing Attention¶

We use Scaled Dot-Product Attention as an example. Suppose the input feature is $$ X \in \mathbb{R}^{N \times C} $$ where:

  • $N = H \times W$: the number of pixels (or latent tokens) after flattening;
  • $C$: the channel dimension at each position.

Linear transformations give Q, K, V: $$ Q = X W_Q, \quad K = X W_K, \quad V = X W_V $$

| Symbol | Shape | Meaning |
|---|---|---|
| $W_Q, W_K, W_V$ | $C \times d$ | learned linear projection matrices |
| $Q, K, V$ | $N \times d$ | query, key, and value vectors at each position |

The similarity matrix and attention weights: $$ A = \text{softmax}\left(\frac{QK^T}{\sqrt{d}}\right) $$ Element-wise form: $$ S = \frac{QK^T}{\sqrt{d}}, \quad S_{ij} = \frac{q_i \cdot k_j}{\sqrt{d}} \quad \Rightarrow \quad A_{ij} = \frac{\exp(S_{ij})}{\sum_{m=1}^{N} \exp(S_{im})} $$

| Symbol | Shape | Meaning |
|---|---|---|
| $QK^T$ | $N \times N$ | similarity between every pair of positions |
| $A$ | $N \times N$ | attention weights matrix, row-normalized |

Weighted sum to get the output: $$ Y = A V, \quad Y_i = \sum_{j=1}^{N} A_{ij} V_j $$

| Symbol | Shape | Meaning |
|---|---|---|
| $V$ | $N \times d$ | value representation at each position |
| $A V$ | $N \times d$ | global features for each query after attention |
| $Y$ | $N \times d$ | self-attention output (reshape back to $C \times H \times W$) |

Generally, in the attention framework, Q means "what am I looking for," K means "what am I," and V means "what am I providing." In an LDM U-Net:

| Symbol | What it means in LDM U-Net | Intuitive meaning |
|---|---|---|
| Q (Query) | A current pixel (latent location) "asks" the entire latent feature map: how should I adjust my noise prediction? | Each position decides how to combine information from other positions |
| K (Key) | The feature signature of each pixel (latent location), representing its structure/semantics | Indicates to others "what kind of feature am I" |
| V (Value) | The feature values each pixel offers for sharing (content others may use) | What features others extract from me to update themselves |

Why does attention achieve "global semantic fusion"?

  • $q_i \cdot k_j$: if position $i$ and $j$ have similar features (semantically related), the inner product is large;
  • after softmax, $A_{ij}$ converts this similarity into a weight;
  • the weighted sum $\sum_j A_{ij} v_j$ allows position $i$ to aggregate content from the entire map that is semantically related.

In other words: each position's new representation $y_i$ is "I (the query) retrieve, from all positions (keys), the information (values) weighted by semantic similarity."
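
A minimal single-head implementation mapping the equations above to code (a sketch; production U-Nets typically add GroupNorm, multiple heads, and a zero-initialized output projection). Here $d = C$, i.e., the projections keep the channel count:

In [ ]:
import torch
import torch.nn as nn

class SelfAttention2d(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.q = nn.Conv2d(channels, channels, 1)  # W_Q as a 1x1 conv
        self.k = nn.Conv2d(channels, channels, 1)  # W_K
        self.v = nn.Conv2d(channels, channels, 1)  # W_V
        self.proj = nn.Conv2d(channels, channels, 1)

    def forward(self, x):                          # x: [B, C, H, W]
        B, C, H, W = x.shape
        q = self.q(x).flatten(2).transpose(1, 2)   # [B, N, C], N = H*W
        k = self.k(x).flatten(2).transpose(1, 2)   # [B, N, C]
        v = self.v(x).flatten(2).transpose(1, 2)   # [B, N, C]
        attn = torch.softmax(q @ k.transpose(1, 2) / C ** 0.5, dim=-1)  # A: [B, N, N]
        y = attn @ v                               # Y = A V: [B, N, C]
        y = y.transpose(1, 2).reshape(B, C, H, W)  # back to a feature map
        return x + self.proj(y)                    # residual: output shape = input shape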

--- $\varepsilon$ / $x_0$ / $v$ Prediction¶

Recall that in the reverse process, we need to compute the posterior mean $\mu_\theta(\mathbf{x}_t, t)$ where $$ p_\theta(\mathbf{x}_{t-1} \mid \mathbf{x}_t) = \mathcal{N}\left(\mathbf{x}_{t-1}; \mu_\theta(\mathbf{x}_t, t), \Sigma'\right) $$ In the original DDPM, we use the fact that $\mu$ can be derived from $x_t$ and $x_0$, and since $x_0$ can be written as a function of $\varepsilon$, we use a network $\varepsilon_\theta(x_t,t)$ to predict $\varepsilon$ to derive $\mu$. In fact, we have three options for the network’s output, which are mathematically equivalent but differ in numerical stability during training.

  1. $\varepsilon$-prediction:$\varepsilon_\theta(x_t,t) \rightarrow \hat\varepsilon$ (from original DDPM)
  2. $x_0$-prediction:$x_{0,\theta}(x_t,t) \rightarrow \hat x_{0,\theta}$ (In the original DDPM paper (Ho et al., 2020, Sec. 3.2), the authors mentioned that the model could alternatively predict $x_0$, but they found it produced lower-quality samples compared to $\varepsilon$-prediction, so they adopted the latter as the main approach).
  3. $v$-prediction: $v_\theta(x_t,t) \rightarrow \hat v$. Predicts $v$: a linear combination of $\varepsilon$ and $x_0$ (The v-prediction idea originates from Salimans & Ho (2022) (Progressive Distillation), and it was popularized by Imagen (Saharia et al., 2022)).

By the Gaussian Conditioning Formula, the posterior mean of $q(\mathbf{x}_{t-1} \mid \mathbf{x}_t, \mathbf{x}_0)$ is $$ \mu' = \frac{\sqrt{\bar\alpha_{t-1}} \beta_t}{1 - \bar\alpha_t} \mathbf{x}_0 + \frac{\sqrt{\alpha_t}(1 - \bar\alpha_{t-1})}{1 - \bar\alpha_t} \mathbf{x}_t $$ No matter which of $\varepsilon$ / $x_0$ / $v$ the network predicts, we ultimately use the predicted quantity to derive $\hat{\mathbf{x}}_0$ and plug it back into this formula to get $\mu_\theta(\mathbf{x}_t, t)$.

Below we will discuss the three approaches separately:

  • How to recover $\hat{\mathbf{x}}_0$, then plug $\hat{\mathbf{x}}_0$ into the formula to get $\mu_\theta(\mathbf{x}_t, t)$.
  • What the loss function looks like, and a brief derivation of how it comes about.
  • What are the advantages of using this prediction.

1. $\varepsilon$-prediction¶

Network output: $\varepsilon_\theta(x_t,t) \rightarrow \hat\varepsilon$

Recover $\hat{\mathbf{x}}_0$ from $\hat\varepsilon$ $$ \hat x_0(x_t,t)=\frac{x_t-\sqrt{1-\bar\alpha_t}\hat\varepsilon}{\sqrt{\bar\alpha_t}}. $$

Directly substitute $\hat x_0$ into the posterior mean formula to get $\mu_\theta$ $$ \mu_\theta(x_t,t)=\frac{\sqrt{\bar\alpha_{t-1}}\beta_t}{1-\bar\alpha_t}\hat x_0 +\frac{\sqrt{\alpha_t}(1-\bar\alpha_{t-1})}{1-\bar\alpha_t}x_t. $$

Loss: $\mathcal L=\mathbb{E} \left[ \| \boldsymbol{\epsilon} - \hat{\boldsymbol{\epsilon}} \|^2 \right]$

Advantages: Historically the most widely used; the formulas are simple and training is stable.

2. $x_0$-prediction¶

Network output: $x_{0,\theta}(x_t,t) \rightarrow \hat x_{0}$

Directly substitute $\hat x_0$ into the posterior mean formula to get $\mu_\theta$ $$ \mu_\theta(x_t,t)=\frac{\sqrt{\bar\alpha_{t-1}}\beta_t}{1-\bar\alpha_t}\hat x_0 +\frac{\sqrt{\alpha_t}(1-\bar\alpha_{t-1})}{1-\bar\alpha_t}x_t. $$

Loss: $\mathcal L=\mathbb E\left[\|x_0-\hat x_{0}\|^2\right]$ (Easily obtained by plugging $\mu_\theta(x_t,t)$ into $L_{t-1}$.)

Advantages: Directly targets the pixel space, which is convenient when combining with perceptual/discriminator losses.

3. $v$-prediction¶

Apply an orthogonal linear transformation (coefficients related to $t$) to $(x_0,\varepsilon)$: $$ a_t=\sqrt{\bar\alpha_t},\quad b_t=\sqrt{1-\bar\alpha_t},\quad \Rightarrow \quad v_t = a_t\varepsilon - b_tx_0 $$

Network output: $v_\theta(x_t,t) \rightarrow \hat v$

Recover $\hat x_0$ and $\hat\varepsilon$ from $\hat v$: Solve the system (Note $a_t^2+b_t^2=1$) $$ \begin{cases} x_t = a_t x_0 + b_t \varepsilon, \\ v_t = a_t \varepsilon - b_t x_0 \end{cases} \quad \Rightarrow \quad \begin{cases} \hat x_0=a_tx_t - b_t\hat v_\theta, \\ \hat\varepsilon=b_tx_t + a_t\hat v_\theta \end{cases} $$

Directly substitute $\hat x_0$ into the posterior mean formula to get $\mu_\theta$ $$ \mu_\theta(x_t,t)=\frac{\sqrt{\bar\alpha_{t-1}}\beta_t}{1-\bar\alpha_t}[a_t x_t-b_t\hat v_\theta] +\frac{\sqrt{\alpha_t}(1-\bar\alpha_{t-1})}{1-\bar\alpha_t}x_t $$

Loss: $\mathcal L=\mathbb E\left[\|v_t-\hat v\|^2\right]$ (Easily obtained by plugging $\mu_\theta(x_t,t)$ into $L_{t-1}$.)

Advantages: $(x_0,\varepsilon)$ are "rotated" so that gradients are more balanced during high/low SNR phases, leading to a trade-off between quality and controllability. Widely adopted by Imagen, Stable Diffusion v2+, etc.

In practice, when using a neural network for estimation, the three methods all take the same inputs, namely $x_t$ and $t$. The only differences are in the network head (output) and the loss function: you regress either $\epsilon$, $x_0$, or $v$.

In fact, during real training, if you already have an implementation and want to switch to a different prediction method (for numerical reasons, for example), you only need to change the output head and the loss target to the corresponding quantity. The posterior mean ($\mu$) can be computed using the relevant formula for each prediction type. All other training and sampling procedures stay the same.
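
As a concrete sketch of that switch (reusing the Diffusion class defined earlier; `pred_type` is a hypothetical flag, and the $x_0$/$v$ heads are assumptions rather than part of the implementation above):

In [ ]:
import torch
import torch.nn.functional as F

def training_loss(model, diffusion, x0, pred_type="eps"):
    # assumes flat data (B, xdim), matching q_sample above
    t = torch.randint(0, diffusion.T, (x0.size(0),), device=x0.device)
    x_t, eps = diffusion.q_sample(x0, t)
    a = diffusion.sqrt_alphas_cumprod[t][:, None]            # a_t = sqrt(abar_t)
    b = diffusion.sqrt_one_minus_alphas_cumprod[t][:, None]  # b_t = sqrt(1 - abar_t)
    pred = model(x_t, t)
    if pred_type == "eps":    # original DDPM: regress the noise
        target = eps
    elif pred_type == "x0":   # regress the clean sample
        target = x0
    else:                     # "v": regress v_t = a_t * eps - b_t * x0
        target = a * eps - b * x0
    return F.mse_loss(pred, target)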

--- Appendix¶

[Image: ddpm (detailed calculations referenced in the text above)]