Dhariwal & Nichol's 2021 paper “Diffusion Models Beat GANs on Image Synthesis” (the follow-up to “Improved DDPM”) showed that unconditional DDPMs still lagged behind GANs in sample quality, and it proposed the conditional diffusion model. The authors found that adding a class label as an additional input to the diffusion model significantly improved image quality (especially FID), even surpassing contemporary GAN models. This is why researchers began adding conditions to diffusion models. Only later did conditioning mechanisms evolve into a general tool for semantic and user-driven control, as seen in Classifier Guidance, CFG, and eventually LDM.
The evolution of conditional diffusion models follows the path: Conditional Diffusion → Classifier Guidance → Classifier-Free Guidance (CFG) → LDM.
In this note, we focus on the models before LDM. Before the emergence of LDM, almost all diffusion models (including conditional ones) were built upon the DDPM framework. Therefore, all pre-LDM conditional diffusion models can essentially be viewed as Conditional DDPMs. These models used conditions (such as class labels or semantic maps) to assist image generation and improve sample quality, but their inputs and outputs remained within the image domain.
In the LDM note, we will discuss LDM in detail. LDM differs from conditional DDPM in two ways: (1) it introduces a VAE to encode images into a low-dimensional latent space and runs the diffusion process there; (2) it conditions on embeddings from other modalities (most notably text), making conditional diffusion truly multimodal (text ↔ image).
| Stage | Representative Model | Core Idea | Conditional Generation |
|---|---|---|---|
| 1️⃣ DDPM (Basic Diffusion Model) | Ho et al., “Denoising Diffusion Probabilistic Models” (NeurIPS 2020) | Simulates the process of gradually adding and then removing noise, learning the noise distribution. | ❌ Unconditional (only learns to generate images) |
| 2️⃣ Conditional Diffusion | Dhariwal & Nichol, “Diffusion Models Beat GANs on Image Synthesis” (2021), etc. | Adds conditioning (e.g., labels, embeddings, or images) as an input to the denoising prediction. | ✅ Conditional |
| 3️⃣ Classifier Guidance | Dhariwal & Nichol, “Diffusion Models Beat GANs on Image Synthesis” (2021) | Uses the gradient from an external classifier to guide the diffusion direction, enabling conditional control. | ✅ Conditional (via external classifier) |
| 4️⃣ Classifier-Free Guidance (CFG) | Ho & Salimans (2022) | Learn both conditional and unconditional modes in one model and mix them at inference. | ✅ Conditional (internally implemented) |
| 5️⃣ LDM (Latent Diffusion Model) | Rombach et al., CVPR 2022 | Trains the diffusion process in the VAE latent space, conditioned on text embeddings (CLIP/Text Encoder). | ✅ Conditional (multimodal embedding) |
Takeaway:
- In conditional diffusion, what are some examples of inputs that can serve as the condition for the noise prediction network?
- In classifier guidance, what (if anything) is fed into the noise prediction network as a condition?
- How is the condition injected into the noise prediction network in conditional diffusion?
- How does Classifier Guidance modify the model’s predicted noise? What is the intuition via Bayes’ rule?
- Does CFG use the same noise prediction network as an ordinary conditional DDPM?
--- Conditional Diffusion¶
In a standard DDPM, a neural network learns to predict noise $\hat{\boldsymbol\epsilon} = \epsilon_\theta(\mathbf{x}_t, t)$. In a conditional diffusion model, we introduce a condition $c$ as an extra input to the network: $$\hat{\boldsymbol\epsilon} = \epsilon_\theta(\mathbf{x}_t, t, c)$$ This allows the denoising process to be guided by external information ($c$ can be a class label, a text embedding, an image, etc.). For example:
- if $c$ is a class label: $c = 5$ (could be any int);
- if $c$ is text embedding: $c$ = text_encoder("a cute cat");
- if $c$ is image: $c$ = original img or $c$ = img_encoder(original img).
Condition injection into network: usually by addition, concatenation, or cross-attention.
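As a rough sketch of these three mechanisms (toy shapes and layer names chosen for illustration, not taken from any particular model):
import torch
import torch.nn as nn

B, C, H, W, D = 4, 64, 16, 16, 128            # feature map [B, C, H, W], condition embedding [B, D]
h = torch.randn(B, C, H, W)
cemb = torch.randn(B, D)

# 1) Addition: project the condition to C channels and add it at every spatial location
add_proj = nn.Linear(D, C)
h_add = h + add_proj(cemb)[:, :, None, None]                  # [B, C, H, W]

# 2) Concatenation: broadcast the condition spatially and concatenate along channels
c_map = cemb[:, :, None, None].expand(B, D, H, W)
h_cat = torch.cat([h, c_map], dim=1)                          # [B, C + D, H, W]; next conv takes C + D input channels

# 3) Cross-attention: image tokens (queries) attend to condition tokens (keys/values), e.g. text tokens
attn = nn.MultiheadAttention(embed_dim=C, num_heads=4, kdim=D, vdim=D, batch_first=True)
tokens = h.flatten(2).transpose(1, 2)                         # [B, H*W, C]
ctx = cemb[:, None, :]                                        # [B, 1, D] (a single condition token here)
h_attn, _ = attn(tokens, ctx, ctx)                            # [B, H*W, C]
h_attn = h_attn.transpose(1, 2).reshape(B, C, H, W)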
Architecture¶
The forward and reverse processes are exactly the same as in the original DDPM. The only difference is that the noise prediction network now takes a condition $c$ as an extra input, so its architecture necessarily differs from the network in the original DDPM.
Training Data: labeled images $(\mathbf{x}_0, c)$.
Forward Process: $$ q(\mathbf{x}_t | \mathbf{x}_0) = \mathcal{N}(\mathbf{x}_t; \sqrt{\bar{\alpha}_t} \mathbf{x}_0, (1 - \bar{\alpha}_t)I) $$
Reverse Process ($\mu_\theta$ can be derived using $\epsilon_\theta(\mathbf{x}_t, t, c)$): $$ p_\theta(\mathbf{x}_{t-1}|\mathbf{x}_t, c) = \mathcal{N}(\mathbf{x}_{t-1}; \mu_\theta(\mathbf{x}_t, t, c), \Sigma_t) $$
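Concretely, $\mu_\theta$ takes the same form as in the unconditional DDPM, only with the condition threaded through the noise predictor: $$ \mu_\theta(\mathbf{x}_t, t, c) = \frac{1}{\sqrt{\alpha_t}}\left(\mathbf{x}_t - \frac{1-\alpha_t}{\sqrt{1-\bar{\alpha}_t}}\,\epsilon_\theta(\mathbf{x}_t, t, c)\right) $$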
Toy Example Implementation (Additive Conditioning)¶
This implements condition injection in the U-Net for DDPM. Compare it with the original (unconditional) DDPM U-Net to see exactly what changes adding a condition introduces. In this implementation, we add the global condition embedding to the global time embedding, and at each injection point we project the combined vector to the required number of channels.
Here we use a single time_mlp for both the time and class (condition) embeddings, so both act as a unified modulation signal; this simplifies the architecture and training. The shared mechanism is not due to any semantic similarity between time and class, but because simple linear modulation works well in diffusion models. If the condition is more complex (e.g., text or image features), a shared MLP is not sufficient.
import torch
import torch.nn as nn
import torch.nn.functional as F
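import math

# NOTE: `timestep_embedding` is used below but was not defined in the original snippet.
# This is a minimal sinusoidal embedding (standard DDPM/Transformer style), added here
# only so that the toy example runs end to end; `dim` is assumed to be even.
def timestep_embedding(t, dim):
    half = dim // 2
    freqs = torch.exp(-math.log(10000.0) * torch.arange(half, dtype=torch.float32) / (half - 1))
    args = t.float()[:, None] * freqs[None, :]                      # [B, half]
    return torch.cat([torch.sin(args), torch.cos(args)], dim=-1)    # [B, dim]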
class ToyUNet_ConditionalDDPM(nn.Module):
def __init__(self, in_ch=3, num_classes=10):
super().__init__()
self.in_ch = in_ch
out_ch = in_ch
emb_dim = 128 # embedding dim for time & class
# --- Time embedding ---
self.time_mlp = nn.Sequential(
nn.Linear(emb_dim, 64),
nn.ReLU(),
nn.Linear(64, 32)
)
# --- Class embedding ---
self.class_emb = nn.Embedding(num_classes, emb_dim)
# projection layers
self.time_proj_enc1 = nn.Linear(32, 16)
self.time_proj_enc2 = nn.Linear(32, 32)
self.time_proj_dec1 = nn.Linear(32, 16)
self.time_proj_dec2 = nn.Linear(32, 8)
# --- Encoder ---
self.enc1 = nn.Sequential(nn.Conv2d(in_ch, 16, 3, padding=1), nn.ReLU())
self.enc2 = nn.Sequential(nn.Conv2d(16, 32, 3, padding=1), nn.ReLU())
# --- Decoder ---
self.up1 = nn.ConvTranspose2d(32, 16, 2, stride=2)
self.dec1 = nn.Sequential(nn.Conv2d(16 + 32, 16, 3, padding=1), nn.ReLU())
self.up2 = nn.ConvTranspose2d(16, 8, 2, stride=2)
self.dec2 = nn.Sequential(nn.Conv2d(8 + 16, 8, 3, padding=1), nn.ReLU())
self.out_conv = nn.Conv2d(8, out_ch, 1)
    def forward(self, x, t, y):
        # x: noisy images x_t [B, in_ch, H, W]; t: timesteps [B]; y: class labels [B] (the condition)
# --- time embedding ---
temb = timestep_embedding(t, dim=128) # [B, 128]
temb = self.time_mlp(temb) # [B, 32]
# --- class embedding ---
cemb = self.class_emb(y) # [B, 128]
cemb = self.time_mlp(cemb) # reuse same MLP to project
temb = temb + cemb # merge condition with time (Improved DDPM style)
# --- Encoder ---
x1 = self.enc1(x) + self.time_proj_enc1(temb)[:, :, None, None]
x2 = F.max_pool2d(x1, 2)
x3 = self.enc2(x2) + self.time_proj_enc2(temb)[:, :, None, None]
        # temb is 32-d, which happens to match the 32 bottleneck channels, so no projection is needed here
        bottleneck = F.max_pool2d(x3, 2) + temb[:, :, None, None]
# --- Decoder ---
        # (use `h` for decoder features so that the class label `y` is not shadowed)
        h = self.up1(bottleneck)
        h = torch.cat([h, x3], dim=1)
        h = self.dec1(h) + self.time_proj_dec1(temb)[:, :, None, None]
        h = self.up2(h)
        h = torch.cat([h, x1], dim=1)
        h = self.dec2(h) + self.time_proj_dec2(temb)[:, :, None, None]
        return self.out_conv(h)
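A quick shape sanity check of the toy network (the 32×32 resolution, batch size, and label range below are arbitrary choices for illustration):
model = ToyUNet_ConditionalDDPM(in_ch=3, num_classes=10)
x = torch.randn(8, 3, 32, 32)        # a batch of noisy images x_t
t = torch.randint(0, 1000, (8,))     # diffusion timesteps
y = torch.randint(0, 10, (8,))       # class labels, i.e. the condition c
print(model(x, t, y).shape)          # torch.Size([8, 3, 32, 32]), same shape as the input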
--- Classifier Guidance¶
The conditioning in plain conditional diffusion is relatively weak and calls for stronger guidance; Classifier Guidance addresses this issue. It was originally introduced in Dhariwal & Nichol (2021), “Diffusion Models Beat GANs on Image Synthesis”. Solution: introduce the gradient of an external classifier $p_\phi(y | \mathbf{x}_t)$ to steer the sampling direction toward a desired class during the diffusion process.
Architecture¶
The forward and reverse processes of a model with Classifier Guidance follow exactly the same flow as the original DDPM. The only extra step is to feed $\mathbf{x}_t$ into the external classifier together with the desired label $y$, compute the gradient $\nabla_{x_t} \log p_\phi(y|\mathbf{x}_t)$, and use it to adjust the output of the noise prediction network, yielding a new noise prediction.
Training Data: labeled images $(\mathbf{x}_0, y)$ (the labels are needed to train the classifier; the diffusion model itself is unconditional).
Diffusion model: Unconditional DDPM.
Extra network: a classifier trained on noisy images, whose input is $\mathbf{x}_t$ (and typically the timestep $t$) and whose output is the class logits / predicted label $\hat y$. It is trained separately before sampling, or an open-source pre-trained noise-aware classifier is used.
Noise Prediction Process:
Predict noise via noise prediction network (usually U-Net): $\varepsilon_\theta(\mathbf{x}_t,t) \rightarrow \hat{\boldsymbol{\varepsilon}}$.
Compute $\nabla_{x_t} \log p_\phi(y|\mathbf{x}_t)$ and use it to update $\hat{\boldsymbol{\varepsilon}}$, where $s$ is the guidance strength, a hyperparameter: $$\hat{\boldsymbol{\varepsilon}}' = \hat{\boldsymbol{\varepsilon}} - s \cdot \nabla_{x_t} \log p_\phi(y|\mathbf{x}_t).$$ (The original paper additionally scales the gradient by $\sqrt{1-\bar\alpha_t}$; that constant can be absorbed into $s$.)
What exactly happens when we compute $\nabla_{x_t} \log p_\phi(y|\mathbf{x}_t)$?
Let the classifier logits on the noisy image $\mathbf{x}_t$ be $\mathbf{z} = (z_1, z_2, ..., z_y, ..., z_C)\in\mathbb{R}^C$, i.e., the pre-softmax layer, whose dimension equals the number of classes $C$. The log posterior probability of class $y$, $\log p_\phi(y\mid \mathbf{x}_t)$, is given by the softmax: $$ p_\phi(y\mid \mathbf{x}_t)=\frac{e^{z_y}}{\sum_{c=1}^C e^{z_c}} \quad\Rightarrow\quad \log p_\phi(y\mid \mathbf{x}_t)= z_y - \log\sum_{c=1}^C e^{z_c}. $$ To obtain $\nabla_{\mathbf{x}_t} \log p_\phi(y\mid \mathbf{x}_t)$, the key is to compute $\nabla_{\mathbf{z}}\log p_\phi(y\mid \mathbf{x}_t)$. Consider each component $z_k$: $$ \frac{\partial}{\partial z_k}\log p_\phi(y\mid \mathbf{x}_t) = \frac{\partial}{\partial z_k}\Big(z_y - \log\sum_{c=1}^C e^{z_c}\Big) = \mathbf{1}[k=y] - p_\phi(k\mid \mathbf{x}_t). $$ In vector form this is the classic result (with $\mathbf{p}=\text{softmax}(\mathbf{z})$ and $\mathbf{e}_y$ the one-hot vector with a 1 in the $y$-th position): $$ \nabla_{\mathbf{z}}\log p_\phi(y\mid \mathbf{x}_t)= \mathbf{e}_y - \mathbf{p} \quad \in\mathbb{R}^C $$ Then by the chain rule we obtain the following, where $\frac{\partial \mathbf{z}}{\partial \mathbf{x}_t} \in\mathbb{R}^{C\times\text{dim }\mathbf{x}_t}$ is the Jacobian of the classifier from its input to the logits, computed by standard backpropagation: $$ \nabla_{\mathbf{x}_t}\log p_\phi(y\mid \mathbf{x}_t) = \frac{\partial \mathbf{z}}{\partial \mathbf{x}_t}^{\top} \nabla_{\mathbf{z}}\log p_\phi(y\mid \mathbf{x}_t) \in\mathbb{R}^{\text{dim }\mathbf{x}_t} $$ A quick numerical check of the softmax-gradient identity is sketched right after these steps.
Compute the posterior mean using $\hat{\boldsymbol{\varepsilon}}'$ (the same formula as original DDPM).
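A quick numerical check of the softmax-gradient identity $\nabla_{\mathbf{z}}\log p_\phi(y\mid \mathbf{x}_t)= \mathbf{e}_y - \mathbf{p}$, using PyTorch autograd on toy logits:
import torch
import torch.nn.functional as F

z = torch.randn(5, requires_grad=True)                     # logits for C = 5 classes
y = 2                                                      # target class index
F.log_softmax(z, dim=0)[y].backward()                      # fills z.grad with ∇_z log p(y|z)
p = F.softmax(z, dim=0).detach()
e_y = F.one_hot(torch.tensor(y), num_classes=5).float()
print(torch.allclose(z.grad, e_y - p))                     # True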
Implementation Highlights¶
We show how to compute the classifier gradient and combine it with the predicted noise; the rest of the sampling loop is the same as in the original DDPM.
# -----------
# Toy Example (Not Runnable)
# -----------
import torch
import torch.nn.functional as F
x_t.requires_grad_(True) # shape [B, Channels * Height * Width]
logits = classifier(x_t) # shape [B, C]. Classifier returns logits, not log probabilities.
log_probs = F.log_softmax(logits, dim=1) # shape [B, C]. Apply softmax for every row: logits -> log probabilities.
# y is the label index which provided with the data x. For a batch, y is a tensor of shape [B]. Example: y = [0, 2, 5, ..., 4]
# NOTICE that we CANNOT use log_probs[:, y]; for example, if y = [0, 2, 5], log_probs[:, y] will select columns 0, 2, and 5 from every row, returning a tensor of shape [3, 3]
selected = log_probs[torch.arange(len(y)), y] # shape [B]. This is log p(y|x_t).
# Compute ∇_{x_t} log p(y|x_t)
# torch.autograd.grad() returns a TUPLE, the length of which equals the number of inputs you pass in. For example:
# grad_tuple = torch.autograd.grad(outputs, inputs=(x_t, w, b)) -> (grad_x_t, grad_w, grad_b)
#
# NOTICE that we ACTUALLY want to compute the gradient of selected[i] with respect to x_t[i].
# We use selected.sum() because the gradient of the sum is the same as the gradient of selected[i] wrt x_t[i].
grad_tuple = torch.autograd.grad(selected.sum(), x_t) # -> (grad_x_t,) grad_x_t has the same shape as x_t
grad = grad_tuple[0]
# Combine them (classifier guidance)
eps_guided = eps_pred - s * grad
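The guided noise $\hat{\boldsymbol{\varepsilon}}'$ then simply replaces $\hat{\boldsymbol{\varepsilon}}$ in the usual DDPM posterior-mean update. A sketch of that final step (alpha_t, alpha_bar_t, and sigma_t are assumed to be precomputed scalars for the current timestep):
# detach because x_t had requires_grad enabled for the classifier pass; skip the added noise at the final step t = 0
mean = (x_t - (1 - alpha_t) / (1 - alpha_bar_t) ** 0.5 * eps_guided) / alpha_t ** 0.5
x_prev = mean.detach() + sigma_t * torch.randn_like(x_t)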
Why Classifier Guidance Works¶
For conditional diffusion, adding the condition embedding to every layer does not count as true “strong guidance.” It only makes the conditional information propagate more thoroughly and be less easily forgotten, but it still belongs to the category of “weak conditioning”. This is structural enhancement, not signal amplification: it does not change the model’s sampling mechanism.
In contrast, “strong guidance” directly modifies the model’s predicted noise during sampling, altering the generation trajectory. Specifically, $$\hat{\boldsymbol{\varepsilon}}' = \hat{\boldsymbol{\varepsilon}} - s \cdot \nabla_{x_t} \log p_\phi(y|\mathbf{x}_t)$$ shifts the diffusion trajectory toward the desired class.
Math¶
Let's build some intuition for why we use $\hat{\boldsymbol{\varepsilon}}' = \hat{\boldsymbol{\varepsilon}} - s \cdot \nabla_{x_t} \log p_\phi(y|\mathbf{x}_t)$.
Recall that under maximum likelihood we aim to maximize $\log p_\theta(x_0)$. Diffusion sampling does not maximize $\log p_\theta(x_0)$ directly; instead, it gradually increases the likelihood of the final $x_0$ by moving along $\nabla_{x_t} \log p(x_t)$ at each step. Here the label is known information, so we want to maximize $\log p_\theta(x_0 \mid y)$; thus, at each step we should move along $\nabla_{x_t} \log p(x_t \mid y)$.
According to Bayes’ rule, $p(x_t \mid y) = \frac{p(y \mid x_t)\,p(x_t)}{p(y)}$; since $p(y)$ does not depend on $x_t$, taking $\nabla_{x_t}\log$ of both sides gives: $$ \nabla_{x_t} \log p(x_t | y) = \nabla_{x_t} \log p(x_t) + \nabla_{x_t} \log p(y | x_t) $$
The first term, $\nabla_{x_t} \log p(x_t)$, corresponds to the denoising direction of the diffusion model itself (i.e., to $\hat{\boldsymbol{\varepsilon}}$). In fact, $$\nabla_{x_t}\log p_t(x_t) \approx -\frac{1}{\sqrt{1-\bar\alpha_t}}\hat{\boldsymbol{\varepsilon}}.$$
This can be easily derived from the forward process: $$ q(x_t\mid x_0)=\mathcal N\big(x_t;\sqrt{\bar\alpha_t}\,x_0,(1-\bar\alpha_t)\mathbf I\big) $$ Taking the gradient of the log-density of the Gaussian: $$ \nabla_{x_t}\log q(x_t\mid x_0) = -\frac{x_t-\sqrt{\bar\alpha_t}\,x_0}{1-\bar\alpha_t} = -\frac{\sqrt{1-\bar\alpha_t}\,\varepsilon}{1-\bar\alpha_t} = -\frac{\varepsilon}{\sqrt{1-\bar\alpha_t}} $$ The training objective of DDPM is to make the model $\varepsilon_\theta(x_t,t)$ approximate the true noise $\boldsymbol\varepsilon$, so we can substitute $\hat{\boldsymbol{\varepsilon}}$ here to obtain the intuition.
The second term, $\nabla_{x_t} \log p(y|x_t)$, is the additional “classifier guidance” direction.
In one sentence¶
Conditional diffusion “tells the network who I am,” while classifier guidance “forces the network to become more like me.” It not only lets the model “know the condition” but also “enforces the response to the condition.”
- Drawback: In theory, for classifier guidance to work well, the classifier must be highly robust to noise, but such a classifier is very difficult to train. This is why later models like GLIDE, Imagen, and Stable Diffusion fully switched to CFG.
--- Classifier-Free Guidance (CFG)¶
As mentioned before, classifier guidance requires a classifier that is robust to noise, which is hard to obtain. So is there a way to add strong guidance while eliminating the need for a classifier? Classifier-Free Guidance (CFG), proposed by Ho & Salimans, 2022 (“Classifier-Free Diffusion Guidance”), removes the need for an external classifier but still provides strong conditioning. It is now the most popular trick for imposing a condition.
Architecture¶
The network used by CFG is exactly the same as the one used for conditional diffusion, except that during training the condition embedding is sometimes “zeroed out” or “replaced with a special null token”.
- During training: With probability $p$, drop the condition $c$ (replace with null token). Thus the same model learns two modes $\epsilon_\theta(x_t, t, c) \text{ and } \epsilon_\theta(x_t, t, \varnothing)$.
- During inference: no random dropping; the two predictions are combined with the CFG scale $s$: $$ \hat{\boldsymbol{\epsilon}}' = \epsilon_\theta(x_t, t, \varnothing) + s \cdot (\epsilon_\theta(x_t, t, c) - \epsilon_\theta(x_t, t, \varnothing)) $$
Network:
- Inputs: $x_t, t, c$ ($c$ may be empty).
- Outputs: noise prediction $\hat{\boldsymbol{\epsilon}}$.
Implementation Highlights¶
import torch
import random
## training: with probability p_uncond (e.g., 0.1), drop the condition so that the
## same network also learns the unconditional mode
if random.random() < p_uncond:
    cond = torch.zeros_like(cond)   # null condition → unconditional mode (must match what inference uses)
pred = model(x_t, t, cond)          # otherwise cond is kept → conditional mode
## inference: run the network twice and extrapolate with the CFG scale
def classifier_free_guidance(model, x_t, t, cond, scale):
    eps_cond = model(x_t, t, cond)                        # conditional prediction
    eps_uncond = model(x_t, t, torch.zeros_like(cond))    # null condition → unconditional prediction
    eps_guided = eps_uncond + scale * (eps_cond - eps_uncond)
    return eps_guided
Why CFG Works¶
In a standard conditional DDPM, during training the model learns “how much noise to remove after seeing condition $c$”, and during sampling we fully trust that learned direction: the model takes a single step with whatever strength it has learned. It may or may not have learned this pull well, and there is no way to manually increase it.
CFG does not change the network structure; it only modifies the sampling combination formula: $$ \hat{\boldsymbol{\epsilon}}' = \epsilon_\theta(x_t, t, \varnothing) + s \cdot (\epsilon_\theta(x_t, t, c) - \epsilon_\theta(x_t, t, \varnothing)) $$ Mathematically, this takes a step $s$ times larger along the direction of the difference between the conditional and unconditional predictions. The unconditional prediction $\epsilon_\theta(x_t,t, \varnothing)$ is “where the model would go without knowing $c$”, and the difference between the two is the “pure conditional signal direction”. Multiplying by $s > 1$ means strengthening the conditional signal by a factor of $s$.
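Rearranging the same formula makes the link to classifier guidance explicit. Using the score relation from the previous section, $\epsilon_\theta(x_t,t,c)-\epsilon_\theta(x_t,t,\varnothing)\approx -\sqrt{1-\bar\alpha_t}\,\big(\nabla_{x_t}\log p(x_t\mid c)-\nabla_{x_t}\log p(x_t)\big) = -\sqrt{1-\bar\alpha_t}\,\nabla_{x_t}\log p(c\mid x_t)$, so the difference term plays exactly the role of the classifier gradient, with the “classifier” defined implicitly by the diffusion model itself: $$ \hat{\boldsymbol{\epsilon}}' = \epsilon_\theta(x_t, t, c) + (s-1)\big(\epsilon_\theta(x_t, t, c) - \epsilon_\theta(x_t, t, \varnothing)\big) \approx \epsilon_\theta(x_t, t, c) - (s-1)\,\sqrt{1-\bar\alpha_t}\,\nabla_{x_t}\log p(c\mid x_t). $$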
Through $\epsilon_\theta(x_t, t, c) - \epsilon_\theta(x_t, t, \varnothing)$, CFG extracts the “pure conditional direction”, then multiplies it by $s$ to amplify it, so CFG effectively “pulls harder” toward the condition, giving stronger guidance.
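A trivial sanity check of the combination formula (random tensors stand in for the two predictions):
import torch

eps_uncond = torch.randn(4, 3, 32, 32)   # ε_θ(x_t, t, ∅)
eps_cond = torch.randn(4, 3, 32, 32)     # ε_θ(x_t, t, c)

def cfg(eps_uncond, eps_cond, s):
    return eps_uncond + s * (eps_cond - eps_uncond)

assert torch.allclose(cfg(eps_uncond, eps_cond, 0.0), eps_uncond)  # s = 0: condition ignored
assert torch.allclose(cfg(eps_uncond, eps_cond, 1.0), eps_cond)    # s = 1: plain conditional prediction
# s > 1 extrapolates past the conditional prediction, i.e. amplifies the conditional signal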