Takeaways
- Understand what an MLP is (see Backpropagation and Implementing Three-Layer Feedforward Network)
- Learn how to implement an MLP (see Implementing Three-Layer Feedforward Network)
  - From scratch
  - Using PyTorch
  - Using TensorFlow
- Understand what perceptrons are (see Perceptron)
- Review common activation functions (see Activation Functions)
Implementing a Three-Layer Feedforward Neural Network¶
This section presents a complete implementation of a three-layer feedforward neural network classifier trained on mini-batches of data (the same network as in the Backpropagation note). We cover three versions with the same architecture and training process:
- Manual (from scratch): the exact gradients derived by hand in the Backpropagation note, coded step by step.
- PyTorch
- TensorFlow
Compare how the implementation differs across these approaches — same model, different tools.
# --- Imports (shared across the three versions) ---
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import tensorflow as tf
# Reproducibility
torch.manual_seed(0)
# --- Simulated dataset ---
num_samples = 1000
input_dim = 4
X_full = torch.randn(num_samples, input_dim)
true_W = torch.tensor([[1.0, -2.0, 0.5, 1.5]])
true_b = torch.tensor([0.2])
logits = X_full @ true_W.T + true_b
y_full = (torch.sigmoid(logits) > 0.5).float() # Binary labels
print("Label distribution:", torch.bincount(y_full.view(-1).long()))
Label distribution: tensor([472, 528])
1. Manual Implementation¶
We'll manually implement:
- Forward pass
- Loss computation (Binary Cross-Entropy)
- Manual backpropagation (no autograd), using the gradient equations summarized below
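For reference, these are the per-batch gradient formulas coded below (the same ones derived in the Backpropagation note), written in matrix form for a batch of size $m$, ReLU hidden activations, and a sigmoid output trained with binary cross-entropy $L = -\frac{1}{m}\sum_i \left[ y_i \log \hat{y}_i + (1 - y_i)\log(1 - \hat{y}_i) \right]$:
$$
\begin{aligned}
\delta_3 &= \tfrac{1}{m}(\hat{y} - y), & \frac{\partial L}{\partial W_3} &= \delta_3^\top A_2, & \frac{\partial L}{\partial b_3} &= \textstyle\sum_i \delta_{3,i} \\
\delta_2 &= (\delta_3 W_3) \odot \mathbf{1}[Z_2 > 0], & \frac{\partial L}{\partial W_2} &= \delta_2^\top A_1, & \frac{\partial L}{\partial b_2} &= \textstyle\sum_i \delta_{2,i} \\
\delta_1 &= (\delta_2 W_2) \odot \mathbf{1}[Z_1 > 0], & \frac{\partial L}{\partial W_1} &= \delta_1^\top X, & \frac{\partial L}{\partial b_1} &= \textstyle\sum_i \delta_{1,i}
\end{aligned}
$$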
# --- Network architecture ---
hidden1_dim = 3
hidden2_dim = 2
output_dim = 1
# --- Hyperparameters ---
lr = 0.0001  # small learning rate; over 20 epochs the loss decreases only slightly (see output below)
batch_size = 32
epochs = 20
# --- Initialize weights (manual, no autograd) ---
def init_weights(shape):
return torch.randn(shape) * 0.1
W1 = init_weights((hidden1_dim, input_dim))
b1 = torch.zeros(hidden1_dim)
W2 = init_weights((hidden2_dim, hidden1_dim))
b2 = torch.zeros(hidden2_dim)
W3 = init_weights((output_dim, hidden2_dim))
b3 = torch.zeros(1)
loss_history = []
# --- Training loop (mini-batch SGD) ---
for epoch in range(epochs):
perm = torch.randperm(num_samples)
total_loss = 0
for i in range(0, num_samples, batch_size):
idx = perm[i:i+batch_size]
X = X_full[idx]
y = y_full[idx]
m = X.shape[0] # actual batch size (may be < batch_size at the end)
# --- Forward ---
Z1 = X @ W1.T + b1
A1 = F.relu(Z1)
Z2 = A1 @ W2.T + b2
A2 = F.relu(Z2)
Z3 = A2 @ W3.T + b3
y_pred = torch.sigmoid(Z3)
# --- Loss ---
eps = 1e-8
loss = -(1/m) * torch.sum(y * torch.log(y_pred + eps) + (1 - y) * torch.log(1 - y_pred + eps))
total_loss += loss.item()
# --- Manual Backpropagation ---
delta3 = (1/m) * (y_pred - y) # (m, 1)
dW3 = delta3.T @ A2 # (1, 2)
db3 = torch.sum(delta3, dim=0) # (1,)
dA2 = delta3 @ W3 # (m, 2)
dZ2 = dA2 * (Z2 > 0).float() # (m, 2)
dW2 = dZ2.T @ A1 # (2, 3)
db2 = torch.sum(dZ2, dim=0) # (2,)
dA1 = dZ2 @ W2 # (m, 3)
dZ1 = dA1 * (Z1 > 0).float() # (m, 3)
dW1 = dZ1.T @ X # (3, 4)
db1 = torch.sum(dZ1, dim=0) # (3,)
# --- Gradient Descent ---
W3 -= lr * dW3
b3 -= lr * db3
W2 -= lr * dW2
b2 -= lr * db2
W1 -= lr * dW1
b1 -= lr * db1
# Record and print loss per epoch
avg_loss = total_loss / (num_samples / batch_size)
loss_history.append(avg_loss)
print(f"Epoch {epoch+1}/{epochs} | Avg Loss: {avg_loss:.4f}")
Epoch 1/20 | Avg Loss: 0.7098 Epoch 2/20 | Avg Loss: 0.7098 Epoch 3/20 | Avg Loss: 0.7098 Epoch 4/20 | Avg Loss: 0.7098 Epoch 5/20 | Avg Loss: 0.7098 Epoch 6/20 | Avg Loss: 0.7098 Epoch 7/20 | Avg Loss: 0.7098 Epoch 8/20 | Avg Loss: 0.7098 Epoch 9/20 | Avg Loss: 0.7098 Epoch 10/20 | Avg Loss: 0.7098 Epoch 11/20 | Avg Loss: 0.7098 Epoch 12/20 | Avg Loss: 0.7098 Epoch 13/20 | Avg Loss: 0.7097 Epoch 14/20 | Avg Loss: 0.7097 Epoch 15/20 | Avg Loss: 0.7098 Epoch 16/20 | Avg Loss: 0.7097 Epoch 17/20 | Avg Loss: 0.7097 Epoch 18/20 | Avg Loss: 0.7097 Epoch 19/20 | Avg Loss: 0.7097 Epoch 20/20 | Avg Loss: 0.7097
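As a sanity check, the hand-derived gradients can be compared against autograd. The sketch below is not part of the loop above; it re-runs the forward pass on a small batch with a fresh copy of the output-layer weights (the `c`-suffixed names are introduced only for this check) and compares autograd's gradient for `W3` with the manual formula. The difference should be essentially zero, up to the small epsilon added inside the logs.
# --- Gradient check (optional sketch) ---
Xc, yc = X_full[:8], y_full[:8]                 # small batch just for the check
W3c = W3.clone().requires_grad_(True)           # track gradients for W3 only
A1c = F.relu(Xc @ W1.T + b1)
A2c = F.relu(A1c @ W2.T + b2)
pc = torch.sigmoid(A2c @ W3c.T + b3)
m_c = Xc.shape[0]
loss_c = -(1/m_c) * torch.sum(yc * torch.log(pc + eps) + (1 - yc) * torch.log(1 - pc + eps))
loss_c.backward()
manual_dW3 = ((1/m_c) * (pc.detach() - yc)).T @ A2c   # same formula as in the loop
print("Max |autograd - manual| for dW3:", (W3c.grad - manual_dW3).abs().max().item())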
2. PyTorch Version¶
This is the standard PyTorch version of the manual implementation above: the same architecture and training process, but with autograd, nn.Module, and an optimizer in place of hand-coded gradients.
# --- Dataset and Dataloader ---
batch_size = 32
dataset = TensorDataset(X_full, y_full)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# --- Model ---
class SimpleNet(nn.Module):
def __init__(self):
super().__init__()
self.fc1 = nn.Linear(4, 3)
self.fc2 = nn.Linear(3, 2)
self.out = nn.Linear(2, 1)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = torch.sigmoid(self.out(x))
return x
model = SimpleNet()
# --- Loss and Optimizer ---
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
# --- Training Loop ---
epochs = 20
loss_history = []
for epoch in range(epochs):
total_loss = 0
for X_batch, y_batch in loader:
y_pred = model(X_batch)
loss = criterion(y_pred, y_batch)
optimizer.zero_grad()
loss.backward()
optimizer.step()
total_loss += loss.item()
avg_loss = total_loss / len(loader)
loss_history.append(avg_loss)
print(f"Epoch {epoch+1}/{epochs} | Avg Loss: {avg_loss:.4f}")
Epoch 1/20 | Avg Loss: 0.7113 Epoch 2/20 | Avg Loss: 0.7044 Epoch 3/20 | Avg Loss: 0.6970 Epoch 4/20 | Avg Loss: 0.6895 Epoch 5/20 | Avg Loss: 0.6833 Epoch 6/20 | Avg Loss: 0.6776 Epoch 7/20 | Avg Loss: 0.6733 Epoch 8/20 | Avg Loss: 0.6680 Epoch 9/20 | Avg Loss: 0.6636 Epoch 10/20 | Avg Loss: 0.6596 Epoch 11/20 | Avg Loss: 0.6534 Epoch 12/20 | Avg Loss: 0.6478 Epoch 13/20 | Avg Loss: 0.6424 Epoch 14/20 | Avg Loss: 0.6362 Epoch 15/20 | Avg Loss: 0.6293 Epoch 16/20 | Avg Loss: 0.6217 Epoch 17/20 | Avg Loss: 0.6125 Epoch 18/20 | Avg Loss: 0.6034 Epoch 19/20 | Avg Loss: 0.5911 Epoch 20/20 | Avg Loss: 0.5806
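For a quick check of what the trained PyTorch model learned, a minimal evaluation sketch (not part of the training loop above) computes training-set accuracy by thresholding the sigmoid outputs:
# --- Evaluation sketch: training-set accuracy ---
model.eval()
with torch.no_grad():
    preds = (model(X_full) > 0.5).float()       # threshold the sigmoid outputs
    accuracy = (preds == y_full).float().mean().item()
print(f"Training-set accuracy: {accuracy:.3f}")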
3. TensorFlow Version¶
# --- Define model ---
model = tf.keras.Sequential([
    tf.keras.Input(shape=(4,)),          # declare the input shape explicitly
    tf.keras.layers.Dense(3, activation='relu'),
    tf.keras.layers.Dense(2, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(
optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
loss='binary_crossentropy',
metrics=['accuracy']
)
# --- Training ---
history = model.fit(
X_full, y_full,
batch_size=32,
epochs=20,
verbose=1
)
Epoch 1/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 697us/step - accuracy: 0.5020 - loss: 0.7126 Epoch 2/20
32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 657us/step - accuracy: 0.5280 - loss: 0.7089 Epoch 3/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 660us/step - accuracy: 0.5280 - loss: 0.7058 Epoch 4/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 571us/step - accuracy: 0.5280 - loss: 0.7029 Epoch 5/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 550us/step - accuracy: 0.5280 - loss: 0.7003 Epoch 6/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 567us/step - accuracy: 0.5280 - loss: 0.6979 Epoch 7/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 542us/step - accuracy: 0.5280 - loss: 0.6959 Epoch 8/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 562us/step - accuracy: 0.5280 - loss: 0.6941 Epoch 9/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 549us/step - accuracy: 0.5280 - loss: 0.6930 Epoch 10/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 553us/step - accuracy: 0.5280 - loss: 0.6925 Epoch 11/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 559us/step - accuracy: 0.5280 - loss: 0.6923 Epoch 12/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 555us/step - accuracy: 0.5280 - loss: 0.6922 Epoch 13/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 561us/step - accuracy: 0.5280 - loss: 0.6921 Epoch 14/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 549us/step - accuracy: 0.5280 - loss: 0.6920 Epoch 15/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 531us/step - accuracy: 0.5280 - loss: 0.6919 Epoch 16/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 552us/step - accuracy: 0.5280 - loss: 0.6919 Epoch 17/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 543us/step - accuracy: 0.5280 - loss: 0.6918 Epoch 18/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 534us/step - accuracy: 0.5280 - loss: 0.6918 Epoch 19/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 540us/step - accuracy: 0.5280 - loss: 0.6918 Epoch 20/20 32/32 ━━━━━━━━━━━━━━━━━━━━ 0s 564us/step - accuracy: 0.5280 - loss: 0.6918
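matplotlib was imported earlier but has not been used yet; the minimal sketch below plots the per-epoch training loss. Here `loss_history` holds the PyTorch per-epoch losses from section 2 (it was overwritten after the manual run), and `history` is the Keras History object returned by `model.fit`.
# --- Plotting sketch: training loss per epoch ---
plt.plot(loss_history, label="PyTorch (BCELoss)")
plt.plot(history.history["loss"], label="TensorFlow (binary_crossentropy)")
plt.xlabel("Epoch")
plt.ylabel("Average training loss")
plt.legend()
plt.title("Training loss per epoch")
plt.show()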
Perceptron¶
A perceptron is a single unit (neuron) in a neural network. It takes a weighted sum of its inputs, adds a bias, and passes the result through an activation function to produce a one-dimensional output.
The difference between a perceptron and a neural network layer: a layer is a collection of perceptrons that perform the same operation independently, each with its own weights.
For example, in the network above, the first layer has 3 perceptrons; each contributes one dimension of the output:
Input (dim = 4) ───► Linear Layer ───► Output (dim = 3)
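The sketch below makes this distinction concrete with the same dimensions as the diagram: a single perceptron built from nn.Linear(4, 1) produces a one-dimensional output, while a layer of 3 perceptrons (nn.Linear(4, 3)) applies the same operation three times with different weights.
# --- Perceptron vs. layer sketch ---
x = torch.randn(4)                          # one input sample with 4 features

perceptron = nn.Linear(4, 1)                # weighted sum + bias for a single unit
out_single = torch.sigmoid(perceptron(x))   # activation -> one-dimensional output

layer = nn.Linear(4, 3)                     # 3 perceptrons, each with its own weights
out_layer = torch.sigmoid(layer(x))         # each unit contributes one output dimension

print(out_single.shape, out_layer.shape)    # torch.Size([1]) torch.Size([3])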

Activation Functions¶
An activation function transforms a neuron's weighted-sum output before it is passed on. It is usually nonlinear, which is what allows neural networks to represent and learn nonlinear functions. Common examples are listed below, followed by a short numerical comparison.
Sigmoid (Logistic) Activation Function:
- Formula: $\sigma(x) = \frac{1}{1 + e^{-x}}$
- Use Case: Often used in binary classification problems and in the output layer of neural networks for binary outputs.
- Drawbacks:
- Vanishing gradient problem, which can slow down or halt the learning process.
- Outputs are not zero-centered, which can lead to inefficiencies during training.
Hyperbolic Tangent (Tanh) Activation Function:
- Formula: $\tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}$
- Use Case: Commonly used in hidden layers of neural networks.
- Drawbacks:
- Also suffers from the vanishing gradient problem, although less severe compared to the sigmoid function.
- Computationally more expensive than ReLU.
Rectified Linear Unit (ReLU) Activation Function:
- Formula: $\text{ReLU}(x) = \max(0, x)$
- Use Case: Widely used in hidden layers of neural networks, especially in convolutional neural networks (CNNs).
- Drawbacks:
- The dying ReLU problem, where neurons can become inactive and stop learning if they only output zero.
Leaky ReLU Activation Function:
- Formula: $\text{Leaky ReLU}(x) = \max(\alpha x, x)$ where $\alpha$ is typically 0.01
- Use Case: Used to address the dying ReLU problem, ensuring that neurons have a small gradient even when inactive.
- Drawbacks:
- The slope of the negative part (determined by $\alpha$) needs to be set manually and may not be optimal for all tasks.
Parametric ReLU (PReLU) Activation Function:
- Formula: $\text{PReLU}(x) = \max(\alpha x, x)$ where $\alpha$ is a learnable parameter
- Use Case: Similar to Leaky ReLU but with a learnable parameter that adapts during training.
- Drawbacks:
- Introduces additional parameters, increasing the model complexity and training time.
Exponential Linear Unit (ELU) Activation Function:
- Formula: $$ \text{ELU}(x) = \begin{cases} x & \text{if } x > 0 \\ \alpha (e^x - 1) & \text{if } x \leq 0 \end{cases} $$ where $\alpha$ is a hyperparameter.
- Use Case: Used to improve learning speed and performance of deep neural networks.
- Drawbacks:
- Computationally more expensive than ReLU.
- The parameter $\alpha$ needs to be set carefully.
Scaled Exponential Linear Unit (SELU) Activation Function:
- Formula: $$ \text{SELU}(x) = \lambda \begin{cases} x & \text{if } x > 0 \\ \alpha (e^x - 1) & \text{if } x \leq 0 \end{cases} $$ where $\alpha$ and $\lambda$ are fixed parameters.
- Use Case: Used in self-normalizing neural networks (SNNs) to keep the mean and variance of the inputs to each layer close to zero and one respectively.
- Drawbacks:
- May not be suitable for all types of architectures.
- Computationally expensive.
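A minimal numerical comparison of several of the activations above, using PyTorch's built-in implementations (the alpha values shown are the common defaults, not tuned for any particular task):
# --- Activation function comparison sketch ---
x = torch.linspace(-3, 3, 7)
print("sigmoid:   ", torch.sigmoid(x))
print("tanh:      ", torch.tanh(x))
print("relu:      ", F.relu(x))
print("leaky_relu:", F.leaky_relu(x, negative_slope=0.01))
print("prelu:     ", nn.PReLU()(x))           # learnable alpha, initialized to 0.25
print("elu:       ", F.elu(x, alpha=1.0))
print("selu:      ", F.selu(x))               # fixed lambda and alpha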