RNN (Recurrent Neural Network)

RNN is a type of neural network designed for processing sequences of data by maintaining a hidden state that captures information from previous inputs.

There are three challenging points about RNNs:

  1. How the hidden state loops (for both single-layer and multi-layer RNNs).
  2. What the two outputs of an RNN layer in PyTorch are, and what their shapes are.
  3. How the hidden state is differentiated.

For how the hidden state loops, refer to the diagrams and code in Single Layer RNN and Multi Layer RNN.
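
For the two outputs of an RNN layer in PyTorch: `nn.RNN` returns the per-step output `out` (the top-layer hidden state at every time step) and the final hidden state `h_n` (the last hidden state of every layer). A minimal sketch, with hypothetical sizes chosen only to make the shapes visible:

In [ ]:
import torch
import torch.nn as nn

# Hypothetical sizes, chosen only to inspect the shapes of the two outputs
demo_rnn = nn.RNN(input_size=2, hidden_size=3, num_layers=2, batch_first=False)
x_demo = torch.randn(5, 4, 2)     # (T=5, batch=4, input_size=2)
h0_demo = torch.zeros(2, 4, 3)    # (num_layers, batch, hidden_size)

out, hn = demo_rnn(x_demo, h0_demo)
print(out.shape)  # torch.Size([5, 4, 3]) -> (T, batch, hidden_size): top-layer h_t for every t
print(hn.shape)   # torch.Size([2, 4, 3]) -> (num_layers, batch, hidden_size): final h_T of every layer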

For how the hidden state is differentiated, refer to the derivation of gradients and the implementation from scratch.

--- Single Layer RNN Architecture¶

single_layer_rnn

The mathematical formula is as follows: at each time step $t$, the hidden state $h_t$ is computed from the current input $x_t$ and the previous hidden state $h_{t-1}$. This is typically formulated as $$h_t = \psi(W_{xh} x_t + W_{hh} h_{t-1} + b_h)$$ where $W_{xh}$ is the weight matrix for the input, $W_{hh}$ is the weight matrix for the hidden state (notice that if $W_{hh}$ is zero, the RNN degrades to an MLP), $b_h$ is the bias term, and $\psi$ is an activation function such as tanh or ReLU.

The output $y_t$ at each time step can be given by $$y_t = \phi(W_{hy} h_t + b_y)$$ where $W_{hy}$ is the weight matrix from the hidden state to the output, $b_y$ is the bias term for the output, and $\phi$ is an activation function.

Single Layer RNN Implementation¶

I will implement the same simple RNN architecture (with input_size=2, hidden_size=2, output_size=1, sequence_length=3) using three different approaches. First, I build it completely from scratch with manual forward and backward propagation. Then, I show how to implement the same model using PyTorch, and finally with TensorFlow/Keras.

In [ ]:
import numpy as np

# Define the RNN parameters
input_size = 2
hidden_size = 2
output_size = 1
sequence_length = 3
learning_rate = 0.1

From Scratch

In [1]:
# Initialize RNN weights and biases
Wxh = np.random.randn(hidden_size, input_size)  # Weight matrix for input to hidden
Whh = np.random.randn(hidden_size, hidden_size) # Weight matrix for hidden to hidden
Why = np.random.randn(output_size, hidden_size) # Weight matrix for hidden to output
bh = np.zeros((hidden_size, 1))                 # Bias for hidden
by = np.zeros((output_size, 1))                 # Bias for output

# Define the input sequence and target output
input_sequence = np.array([[0, 1], [1, 0], [1, 1]])  # Example input sequence
target_output = np.array([[1], [1], [0]])            # Example target output

# Forward pass
hs = np.zeros((sequence_length, hidden_size, 1))  # Hidden states
ys = np.zeros((sequence_length, output_size, 1))  # Outputs

for t in range(sequence_length):
    # Input at time step t
    xt = input_sequence[t].reshape(-1, 1)
    
    # Previous hidden state (zero vector at the first time step)
    h_prev = hs[t-1] if t > 0 else np.zeros((hidden_size, 1))
    
    # Compute hidden state at time step t
    hs[t] = np.tanh(np.dot(Wxh, xt) + np.dot(Whh, h_prev) + bh)
    
    # Compute output at time step t
    ys[t] = np.dot(Why, hs[t]) + by

# Compute loss (reshape targets to (sequence_length, output_size, 1) so shapes match elementwise)
loss = np.mean((ys - target_output.reshape(sequence_length, output_size, 1))**2)

# Backward pass (backpropagation through time)
dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
dbh, dby = np.zeros_like(bh), np.zeros_like(by)
dh_next = np.zeros_like(hs[0])

for t in reversed(range(sequence_length)):
    dy = ys[t] - target_output[t].reshape(-1, 1)
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dh_next
    dhraw = (1 - hs[t] ** 2) * dh
    dbh += dhraw
    dWxh += np.dot(dhraw, input_sequence[t].reshape(1, -1))
    h_prev = hs[t-1] if t > 0 else np.zeros_like(hs[t])  # zero initial hidden state at t = 0
    dWhh += np.dot(dhraw, h_prev.T)
    dh_next = np.dot(Whh.T, dhraw)

# Update weights and biases
Wxh -= learning_rate * dWxh
Whh -= learning_rate * dWhh
Why -= learning_rate * dWhy
bh -= learning_rate * dbh
by -= learning_rate * dby

# Print loss
print("Loss:", loss)
Loss: 1.8370785003716503
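
The backward pass above can be sanity-checked numerically. The sketch below (not part of the original notebook) re-implements the same forward pass and the BPTT recursion for $W_{xh}$ in self-contained functions and compares the analytic gradient against a central finite-difference estimate; note that the training step above drops the constant $2/(T \cdot \text{output\_size})$ factor of the MSE mean (effectively folding it into the learning rate), while the check below keeps it so that the comparison is exact.

In [ ]:
# Sanity-check sketch (not part of the training step above): compare the BPTT gradient
# of the MSE loss w.r.t. Wxh against a finite-difference estimate, on freshly drawn weights.

def forward_loss(Wxh_, Whh_, Why_, bh_, by_, xs, ts):
    """Forward pass of the single-layer RNN, returning the MSE loss."""
    h = np.zeros((hidden_size, 1))
    sq_err = 0.0
    for t in range(len(xs)):
        h = np.tanh(np.dot(Wxh_, xs[t].reshape(-1, 1)) + np.dot(Whh_, h) + bh_)
        y = np.dot(Why_, h) + by_
        sq_err += np.sum((y - ts[t].reshape(-1, 1)) ** 2)
    return sq_err / (len(xs) * output_size)

def bptt_dWxh(Wxh_, Whh_, Why_, bh_, by_, xs, ts):
    """Analytic gradient of the MSE loss w.r.t. Wxh_ (same recursion as above)."""
    T = len(xs)
    hs_ = [np.zeros((hidden_size, 1))]   # hs_[t+1] is the hidden state after input t
    errs = []
    for t in range(T):
        hs_.append(np.tanh(np.dot(Wxh_, xs[t].reshape(-1, 1)) + np.dot(Whh_, hs_[-1]) + bh_))
        errs.append(np.dot(Why_, hs_[-1]) + by_ - ts[t].reshape(-1, 1))
    dWxh_ = np.zeros_like(Wxh_)
    dh_next = np.zeros((hidden_size, 1))
    for t in reversed(range(T)):
        dy = 2 * errs[t] / (T * output_size)         # exact d(mean squared error)/dy_t
        dh = np.dot(Why_.T, dy) + dh_next
        dhraw = (1 - hs_[t + 1] ** 2) * dh           # tanh derivative
        dWxh_ += np.dot(dhraw, xs[t].reshape(1, -1))
        dh_next = np.dot(Whh_.T, dhraw)
    return dWxh_

rng = np.random.default_rng(0)
W1 = rng.standard_normal((hidden_size, input_size))
W2 = rng.standard_normal((hidden_size, hidden_size))
W3 = rng.standard_normal((output_size, hidden_size))
b1, b2 = np.zeros((hidden_size, 1)), np.zeros((output_size, 1))

analytic = bptt_dWxh(W1, W2, W3, b1, b2, input_sequence, target_output)
numeric = np.zeros_like(analytic)
eps = 1e-6
for i in range(W1.shape[0]):
    for j in range(W1.shape[1]):
        Wp, Wm = W1.copy(), W1.copy()
        Wp[i, j] += eps
        Wm[i, j] -= eps
        numeric[i, j] = (forward_loss(Wp, W2, W3, b1, b2, input_sequence, target_output)
                         - forward_loss(Wm, W2, W3, b1, b2, input_sequence, target_output)) / (2 * eps)

print("max |analytic - numeric|:", np.max(np.abs(analytic - numeric)))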

PyTorch Normal Version

In [ ]:
import torch
import torch.nn as nn

# ----- Dummy data -----
x = torch.tensor([[0, 1], [1, 0], [1, 1]], dtype=torch.float32).unsqueeze(1)  # (T, batch, input_size)
y = torch.tensor([[1], [1], [0]], dtype=torch.float32).unsqueeze(1)            # (T, batch, output_size)

# ----- Architecture -----
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size): # hidden_size is the dimension of the hidden state
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=False, nonlinearity="tanh")
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # x: (T, batch, input_size)
        h0 = torch.zeros(1, x.size(1), hidden_size)  # initial hidden state: (num_layers, batch, hidden_size)
        out, hn = self.rnn(x, h0)                    # out: (T, batch, hidden_size), contains h1, h2, ..., hT of the top hidden layer; hn: (1, batch, hidden_size)
        y_pred = self.fc(out)                        # (T, batch, output_size)
        return y_pred

model = SimpleRNN(input_size, hidden_size, output_size)

# ----- Loss & Optimizer -----
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# ----- Forward + Backward -----
y_pred = model(x)
loss = criterion(y_pred, y)
optimizer.zero_grad()
loss.backward()
optimizer.step()

print("Loss:", loss.item())
Loss: 1.767337441444397
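
The model above produces a prediction at every time step (many-to-many). When only one prediction per sequence is needed (many-to-one), a common variation, not used above, is to feed only the final hidden state `hn[-1]` (equivalently `out[-1]`, the top layer at the last time step) into the linear readout. A hedged sketch of that variant:

In [ ]:
# Variation on the model above: many-to-one readout from the final hidden state.
# hn has shape (num_layers, batch, hidden_size), so hn[-1] is the top layer's last state.
class SimpleRNNLast(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=False, nonlinearity="tanh")
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):              # x: (T, batch, input_size)
        out, hn = self.rnn(x)          # h0 defaults to zeros when omitted
        return self.fc(hn[-1])         # (batch, output_size)

last_model = SimpleRNNLast(input_size, hidden_size, output_size)
print(last_model(x).shape)  # torch.Size([1, 1]) for the (3, 1, 2) input above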

TensorFlow Normal Version

In [ ]:
import tensorflow as tf
from tensorflow.keras import layers, models

# ----- Dummy data -----
x = tf.constant([[[0, 1], [1, 0], [1, 1]]], dtype=tf.float32)  # (batch, T, input_size) = (1, 3, 2)
y = tf.constant([[[1], [1], [0]]], dtype=tf.float32)           # (batch, T, output_size) = (1, 3, 1)

# ----- Architecture -----
inputs = layers.Input(shape=(sequence_length, input_size))
rnn_out = layers.SimpleRNN(hidden_size,
                           activation="tanh", 
                           return_sequences=True)(inputs)  # shape: (batch_size, sequence_length, hidden_size)
outputs = layers.TimeDistributed(layers.Dense(output_size))(rnn_out)  # shape: (batch_size, sequence_length, output_size)

model = models.Model(inputs, outputs)

# ----- Compile -----
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
              loss="mse")

# ----- Train one step -----
loss = model.train_on_batch(x, y)
print("Loss:", loss)
Loss: 0.57874405
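
Keras can expose the same two pieces of information as PyTorch's `out` and `hn` when `return_state=True` is passed to `SimpleRNN`. A small sketch (reusing the symbolic `inputs` tensor defined above; shapes only):

In [ ]:
# Sketch only: SimpleRNN can also return its final state, the analogue of hn in PyTorch.
seq_out, last_state = layers.SimpleRNN(hidden_size,
                                       activation="tanh",
                                       return_sequences=True,
                                       return_state=True)(inputs)
print(seq_out.shape)     # (None, sequence_length, hidden_size) -- h_t for every time step
print(last_state.shape)  # (None, hidden_size)                  -- h_T only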

--- Multi Layer RNN Architecture¶

multi_layer_rnn

In a multi-layer RNN, also known as a stacked RNN, each time step $t$ involves multiple layers of hidden states. Let $h_t^{(l)}$ denote the hidden state at time $t$ and layer $l$. The hidden state of the first layer is computed similarly to a standard RNN:

$$h_t^{(1)} = \psi(W_{x}^{(1)} x_t + W_{h}^{(1)} h_{t-1}^{(1)} + b^{(1)}),$$

where $W_{x}^{(1)}$ is the input-to-hidden weight matrix for the first layer, $W_{h}^{(1)}$ is the recurrent weight matrix within the first layer, and $b^{(1)}$ is the corresponding bias.

For higher layers ($l = 2, \dots, L$), each hidden state is computed based on the hidden state from the previous layer at the same time step, and its own previous time step's hidden state:

$$h_t^{(l)} = \psi(W_{x}^{(l)} h_t^{(l-1)} + W_{h}^{(l)} h_{t-1}^{(l)} + b^{(l)}),$$

where $W_{x}^{(l)}$ now maps from the hidden state of layer $l-1$ to layer $l$, and $W_{h}^{(l)}$ captures the recurrent connection in layer $l$.

The output $y_t$ at each time step is typically computed from the hidden state of the topmost layer (layer $L$):

$$y_t = \phi(W_{hy} h_t^{(L)} + b_y),$$

where $W_{hy}$ and $b_y$ are the output weight matrix and bias, respectively.

RNN and Dynamic Programming

The information flow in a multi-layer RNN follows exactly the same recurrence direction as a two-dimensional dynamic programming (DP) table.

In a multi-layer RNN, information propagates along both the time dimension (from $t-1 \to t$) and the layer dimension (from layer $l-1 \to l$). Thus, the computation of a hidden state $h_{t}^{(l)}$ depends on:

  • the previous time step in the same layer, $h_{t-1}^{(l)}$, and
  • the current time step in the previous layer, $h_{t}^{(l-1)}$.

This matches exactly the dependency pattern of a two-dimensional DP matrix: $$ dp[t][l] \;=\; f\big(dp[t-1][l],\; dp[t][l-1],\; \text{input}\big) $$ In other words, the forward propagation of a multilayer RNN is essentially performing recursive updates over a two-dimensional grid of "time × layers," with the same direction as DP.
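
A tiny illustrative sketch of this correspondence: filling the "time × layers" grid in the same order as a DP table, where each cell only records which cells it depends on.

In [ ]:
# Illustrative only: each grid cell h[t][l] depends on h[t-1][l] (previous time, same layer)
# and h[t][l-1] (same time, previous layer), exactly like dp[t][l] in a 2-D DP table.
T, L = sequence_length, 2   # two stacked layers, matching the num_layers=2 example below

grid = [[None] * L for _ in range(T)]
for t in range(T):                                              # time dimension
    for l in range(L):                                          # layer dimension
        prev_time  = f"h[{t-1}][{l}]" if t > 0 else "h0"        # h_{t-1}^{(l)}
        prev_layer = f"h[{t}][{l-1}]" if l > 0 else f"x[{t}]"   # h_t^{(l-1)}, or the input for layer 0
        grid[t][l] = f"f({prev_time}, {prev_layer})"
        print(f"h[{t}][{l}] = {grid[t][l]}")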

Multi Layer RNN Implementation¶

I will implement a multi-layer RNN architecture with input_size=2, hidden_size=2, output_size=1, sequence_length=3, and num_layers=2. I will show three approaches: first building it from scratch with manual forward and backward propagation, then implementing it using PyTorch, and finally using TensorFlow/Keras.

In [6]:
import numpy as np

# Define parameters
input_size = 2
hidden_size = 2
output_size = 1
sequence_length = 3
num_layers = 2
learning_rate = 0.1
device = "cpu"

From Scratch

In [ ]:
# Initialize weights and biases for each layer
# Dimensions: (hidden_size, input_size) for the first layer, (hidden_size, hidden_size) for subsequent layers
Wxh = [np.random.randn(hidden_size, input_size if l == 0 else hidden_size) for l in range(num_layers)]
# Dimensions: (hidden_size, hidden_size) for all layers
Whh = [np.random.randn(hidden_size, hidden_size) for _ in range(num_layers)]
# Dimensions: (hidden_size, 1) for all layers
bh = [np.zeros((hidden_size, 1)) for _ in range(num_layers)]

# Output layer weights (from top RNN layer)
Why = np.random.randn(output_size, hidden_size)
by = np.zeros((output_size, 1))

# Input and target
input_sequence = np.array([[0, 1], [1, 0], [1, 1]])
target_output = np.array([[1], [1], [0]])

# Forward pass
hs = np.zeros((sequence_length, num_layers, hidden_size, 1))  # hidden states per layer
ys = np.zeros((sequence_length, output_size, 1))              # output per time step

for t in range(sequence_length):
    x = input_sequence[t].reshape(-1, 1)
    for l in range(num_layers):
        prev_h = hs[t-1, l] if t > 0 else np.zeros((hidden_size, 1))
        input_to_layer = x if l == 0 else hs[t, l-1]
        hs[t, l] = np.tanh(np.dot(Wxh[l], input_to_layer) + np.dot(Whh[l], prev_h) + bh[l])
    ys[t] = np.dot(Why, hs[t, -1]) + by  # Output from last RNN layer

# Loss
loss = np.mean((ys - target_output.reshape(sequence_length, output_size, 1))**2)

# Backward pass (BPTT for 2-layer RNN)
dWxh = [np.zeros_like(W) for W in Wxh]
dWhh = [np.zeros_like(W) for W in Whh]
dbh = [np.zeros_like(b) for b in bh]
dWhy = np.zeros_like(Why)
dby = np.zeros_like(by)
dh_next = [np.zeros((hidden_size, 1)) for _ in range(num_layers)]

for t in reversed(range(sequence_length)):
    dy = ys[t] - target_output[t]
    dWhy += np.dot(dy, hs[t, -1].T)
    dby += dy

    dh = [np.zeros((hidden_size, 1)) for _ in range(num_layers)]
    dh[-1] = np.dot(Why.T, dy) + dh_next[-1]

    for l in reversed(range(num_layers)):
        h = hs[t, l]
        dhraw = (1 - h ** 2) * dh[l]
        dbh[l] += dhraw
        input_to_layer = input_sequence[t].reshape(-1, 1) if l == 0 else hs[t, l-1]
        prev_h = hs[t-1, l] if t > 0 else np.zeros_like(h)

        dWxh[l] += np.dot(dhraw, input_to_layer.T)
        dWhh[l] += np.dot(dhraw, prev_h.T)

        if l > 0:
            dh[l-1] = np.dot(Wxh[l].T, dhraw) + dh_next[l-1]
        dh_next[l] = np.dot(Whh[l].T, dhraw)

# Update weights and biases
for l in range(num_layers):
    Wxh[l] -= learning_rate * dWxh[l]
    Whh[l] -= learning_rate * dWhh[l]
    bh[l] -= learning_rate * dbh[l]

Why -= learning_rate * dWhy
by -= learning_rate * dby

print("Loss:", loss)
Loss: 4.053545353348784
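
The per-layer lists `Wxh[l]`, `Whh[l]`, and `bh[l]` above correspond directly to the parameters that PyTorch's `nn.RNN` registers for a stacked RNN, except that PyTorch splits each layer's bias into two terms (`bias_ih_l{k}` and `bias_hh_l{k}`, whose sum plays the role of `bh[l]`). A quick sketch to list them:

In [ ]:
# Sketch: how nn.RNN names and shapes the stacked-RNN parameters used below.
import torch.nn as nn

probe = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
for name, p in probe.named_parameters():
    print(name, tuple(p.shape))
# weight_ih_l0 (2, 2)  <-> Wxh[0]   (input -> layer 1)
# weight_hh_l0 (2, 2)  <-> Whh[0]   (layer 1 recurrence)
# bias_ih_l0   (2,)    \
# bias_hh_l0   (2,)     -> together play the role of bh[0]
# weight_ih_l1 (2, 2)  <-> Wxh[1]   (layer 1 -> layer 2)
# weight_hh_l1 (2, 2)  <-> Whh[1]   (layer 2 recurrence)
# bias_ih_l1   (2,)
# bias_hh_l1   (2,)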

PyTorch Normal Version

In [ ]:
import torch
import torch.nn as nn

# ----- Data -----
# x: (T, batch, input_size), y: (T, batch, output_size)
x = torch.tensor([[0., 1.],
                  [1., 0.],
                  [1., 1.]], dtype=torch.float32).unsqueeze(1)   # (3, 1, 2)
y = torch.tensor([[1.],
                  [1.],
                  [0.]], dtype=torch.float32).unsqueeze(1)       # (3, 1, 1)

# ----- Model -----
class StackedRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            nonlinearity="tanh",
            batch_first=False,     # (T, B, H)
        )
        self.fc = nn.Linear(hidden_size, output_size)  # Top layer readout

    def forward(self, x):
        # x: (T, B, input_size)
        h0 = torch.zeros(num_layers, x.size(1), hidden_size, device=x.device)
        out_all, _ = self.rnn(x, h0)                  # (T, B, H), contains h1, h2, ..., hT of the top hidden layer
        y_pred = self.fc(out_all)                     # (T, B, output_size)
        return y_pred

model = StackedRNN(input_size, hidden_size, output_size, num_layers).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# ----- One training step -----
model.train()
y_pred = model(x)
loss = criterion(y_pred, y)
optimizer.zero_grad()
loss.backward()
optimizer.step()

print("PyTorch Loss:", loss.item())
PyTorch Loss: 0.44763097167015076
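
After `loss.backward()`, the gradient of every stacked parameter is available in its `.grad` field; these are the PyTorch counterparts of `dWxh[l]`, `dWhh[l]`, `dbh[l]`, `dWhy`, and `dby` computed by hand above. A quick inspection (assuming the training step above has just run):

In [ ]:
# Assumes the cell above has just run, so every parameter's .grad is populated.
for name, p in model.named_parameters():
    print(name, tuple(p.shape), "grad L2 norm:", float(p.grad.norm()))
# rnn.weight_ih_l0, rnn.weight_hh_l0, rnn.bias_*_l0  <-> dWxh[0], dWhh[0], dbh[0]
# rnn.weight_ih_l1, rnn.weight_hh_l1, rnn.bias_*_l1  <-> dWxh[1], dWhh[1], dbh[1]
# fc.weight, fc.bias                                 <-> dWhy, dby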

TensorFlow Normal Version

In [ ]:
import tensorflow as tf
from tensorflow.keras import layers, models

# ----- Data -----
# Shape: (batch, T, input_size) and (batch, T, output_size)
x = tf.constant([[[0., 1.],
                  [1., 0.],
                  [1., 1.]]], dtype=tf.float32)   # (1, 3, 2)
y = tf.constant([[[1.],
                  [1.],
                  [0.]]], dtype=tf.float32)       # (1, 3, 1)

# ----- Model -----
inputs = layers.Input(shape=(sequence_length, input_size))
rnn = layers.SimpleRNN(hidden_size, activation="tanh", return_sequences=True)(inputs)  # (batch, T, hidden_size)
for _ in range(num_layers - 1):
    rnn = layers.SimpleRNN(hidden_size, activation="tanh", return_sequences=True)(rnn)  # (batch, T, hidden_size)
outputs = layers.TimeDistributed(layers.Dense(output_size))(rnn)  # (batch, T, output_size)

model = models.Model(inputs, outputs)
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
              loss="mse")

# ----- One training step -----
loss = model.train_on_batch(x, y)
print("TensorFlow Loss:", float(loss))
TensorFlow Loss: 1.0210273265838623

--- RNN Backpropagation¶

rnn_backprop
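
As a compact reference for the diagram above, this is the gradient recursion that the single-layer from-scratch code implements (a sketch assuming a linear output layer and MSE loss, as in that code; constant factors from the mean are folded into the learning rate there). Writing $\delta^y_t = \partial \mathcal{L} / \partial y_t \propto y_t - \text{target}_t$, the output-layer gradients are

$$\frac{\partial \mathcal{L}}{\partial W_{hy}} = \sum_t \delta^y_t\, h_t^{\top}, \qquad \frac{\partial \mathcal{L}}{\partial b_y} = \sum_t \delta^y_t.$$

The gradient with respect to the hidden state flows backward through time (the `dh_next` term in the code):

$$\delta^h_t = W_{hy}^{\top} \delta^y_t + W_{hh}^{\top}\big[(1 - h_{t+1}^{2}) \odot \delta^h_{t+1}\big],$$

where the second term is absent at the last time step. With $\delta^{\text{raw}}_t = (1 - h_t^{2}) \odot \delta^h_t$ (the tanh derivative, `dhraw` in the code), the remaining gradients are

$$\frac{\partial \mathcal{L}}{\partial W_{xh}} = \sum_t \delta^{\text{raw}}_t\, x_t^{\top}, \qquad \frac{\partial \mathcal{L}}{\partial W_{hh}} = \sum_t \delta^{\text{raw}}_t\, h_{t-1}^{\top}, \qquad \frac{\partial \mathcal{L}}{\partial b_h} = \sum_t \delta^{\text{raw}}_t,$$

with $h_0 = 0$ in the $W_{hh}$ term at the first time step.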