Let’s continue from our multivariate linear regression. Now let’s incorporate $L_2$ regularization (weight decay) into our model.
Purpose of this Notebook:
The purposes of this notebook are to:
Incorporate $L_2$ regularization into our linear regression model from scratch
Train our regularized model
Compare our model to the one prebuilt by PyTorch
import torch
from torch import nn
from platform import python_version
python_version(), torch.__version__
('3.12.12', '2.9.0+cu128')
device = 'cpu'
if torch.cuda.is_available():
device = 'cuda'
device
'cpu'
torch.set_default_dtype(torch.float64)
def add_to_class(Class):
"""Register functions as methods in created class."""
def wrapper(obj): setattr(Class, obj.__name__, obj)
    return wrapper
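As a quick check of this helper, registering a method on a hypothetical Dummy class works like this:

class Dummy:
    pass

@add_to_class(Dummy)
def greet(self):
    return 'hello'

Dummy().greet()  # 'hello'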
Dataset¶
create dataset¶
from sklearn.datasets import make_regression
import random
M: int = 10_100 # number of samples
N: int = 6 # number of input features
NO: int = 3 # number of output features
X, Y = make_regression(
n_samples=M,
n_features=N,
n_targets=NO,
n_informative=N - 1,
bias=random.random(),
noise=1
)
print(X.shape)
print(Y.shape)
(10100, 6)
(10100, 3)
split dataset¶
X_train = torch.tensor(X[:100], device=device)
Y_train = torch.tensor(Y[:100], device=device)
X_train.shape, Y_train.shape
(torch.Size([100, 6]), torch.Size([100, 3]))
X_valid = torch.tensor(X[100:], device=device)
Y_valid = torch.tensor(Y[100:], device=device)
X_valid.shape, Y_valid.shape
(torch.Size([10000, 6]), torch.Size([10000, 3]))
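Note that only 100 of the 10,100 samples go to training while 10,000 go to validation; a deliberately small training set like this is presumably the point, since overfitting is the regime where regularization helps most.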
delete raw dataset¶
del X
del Y
Scratch model¶
The only thing we are going to modify is the way in which the model weights are updated. The rest, such as parameter initialization and model training, remains unchanged.
Linear Regression model¶
class LinearRegression:
def __init__(self, n_features: int, out_features: int, lambd: float):
self.w = torch.randn(n_features, out_features, device=device)
self.b = torch.randn(out_features, device=device)
self.lambd = lambd
    def copy_params(self, torch_layer: nn.Linear):
        """
        Copy the parameters from an nn.Linear layer to this model.
        Args:
            torch_layer: PyTorch module from which to copy the parameters.
"""
self.b.copy_(torch_layer.bias.detach().clone())
self.w.copy_(torch_layer.weight.T.detach().clone())
def predict(self, x: torch.Tensor) -> torch.Tensor:
"""
Predict the output for input x
Args:
x: Input tensor of shape (n_samples, n_features).
Returns:
y_pred: Predicted output tensor of shape (n_samples, out_features).
"""
return torch.matmul(x, self.w) + self.b
    def mse_loss(self, y_true: torch.Tensor, y_pred: torch.Tensor) -> float:
"""
MSE loss function between target y_true and y_pred.
Args:
y_true: Target tensor of shape (n_samples, out_features).
y_pred: Predicted tensor of shape (n_samples, out_features).
Returns:
loss: MSE loss between predictions and true values.
"""
return ((y_pred - y_true)**2).mean().item()
    def evaluate(self, x: torch.Tensor, y_true: torch.Tensor) -> float:
"""
Evaluate the model on input x and target y_true using MSE.
Args:
x: Input tensor of shape (n_samples, n_features).
y_true: Target tensor of shape (n_samples, out_features).
Returns:
loss: MSE loss between predictions and true values.
"""
y_pred = self.predict(x)
return self.mse_loss(y_true, y_pred)
def fit(self, x_train: torch.Tensor, y_train: torch.Tensor,
epochs: int, lr: float, batch_size: int,
x_valid: torch.Tensor, y_valid: torch.Tensor):
"""
Fit the model using gradient descent.
Args:
x_train: Input tensor of shape (n_samples, n_features).
            y_train: Target tensor of shape (n_samples, out_features).
            epochs: Number of epochs to fit.
            lr: Learning rate.
            batch_size: Size of each mini-batch.
            x_valid: Input tensor of shape (n_valid_samples, n_features).
            y_valid: Target tensor of shape (n_valid_samples, out_features).
"""
for epoch in range(epochs):
loss = []
for batch in range(0, len(y_train), batch_size):
end_batch = batch + batch_size
y_pred = self.predict(x_train[batch:end_batch])
loss.append(self.mse_loss(
y_train[batch:end_batch],
y_pred
))
self.update(
x_train[batch:end_batch],
y_train[batch:end_batch],
y_pred,
lr
)
loss = round(sum(loss) / len(loss), 4)
loss_v = round(self.evaluate(x_valid, y_valid), 4)
            print(f'epoch: {epoch} - MSE: {loss} - MSE_v: {loss_v}')
Parameters update¶
objective function¶
Now, instead of training the model with the gradient of the loss function $L$, we are going to use the gradient of the objective function $J$. Typically our objective function is as follows:

$$J = L + \lambda \, \Omega$$

where $\lambda$ is an arbitrary parameter.
Note: Do not use the objective function to evaluate the model.
L2 regularization¶
As a weight decay technique, we will use $L_2$ regularization, commonly written $\ell_2$ or $L_2$:

$$\Omega = \frac{1}{2} \lVert \mathbf{w} \rVert_2^2$$

where commonly $0 \leq \lambda \leq 1$.
Note: $\lambda$ is called a hyperparameter, because it is a parameter set by the developer (you), not by the model.
But we have a weight matrix $\mathbf{W} \in \mathbb{R}^{n \times n_o}$, not a vector, so we need to do an equivalent operation:

$$\Omega = \frac{1}{2} \lVert \mathbf{W} \rVert_F^2 = \frac{1}{2} \operatorname{sum}\left(\mathbf{W}^{\odot 2}\right)$$

where $\mathbf{W}^{\odot 2}$ is the element-wise power, or also $\mathbf{W} \odot \mathbf{W}$.
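A quick numerical check of this equivalence (w here is just an arbitrary example matrix, not the model's weights):

w = torch.randn(4, 3)
print(torch.allclose((w ** 2).sum(), (w * w).sum()))  # element-wise power
print(torch.allclose((w ** 2).sum(), w.norm() ** 2))  # squared Frobenius norm
True
True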
objective function derivative¶
Because

$$\frac{\partial J}{\partial \mathbf{W}} = \frac{\partial L}{\partial \mathbf{W}} + \lambda \mathbf{W}
\qquad \text{and} \qquad
\frac{\partial J}{\partial \mathbf{b}} = \frac{\partial L}{\partial \mathbf{b}},$$

the weight update becomes

$$\mathbf{W} \leftarrow \mathbf{W} - \eta \left( \frac{\partial L}{\partial \mathbf{W}} + \lambda \mathbf{W} \right)$$

while the bias update is unchanged, since the bias is not regularized.
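Before hard-coding this gradient, we can sanity-check it against autograd; a minimal sketch, where lambd, w and b are local example values rather than the model's parameters:

lambd = 0.01  # example value
w = torch.randn(N, NO, device=device, requires_grad=True)
b = torch.randn(NO, device=device, requires_grad=True)
x, y = X_train[:5], Y_train[:5]
y_pred = torch.matmul(x, w) + b
J = ((y_pred - y) ** 2).mean() + (lambd / 2) * (w ** 2).sum()
J.backward()
with torch.no_grad():
    delta = 2 * (y_pred - y) / y.numel()
    manual_grad_w = torch.matmul(x.T, delta) + lambd * w
print(torch.allclose(w.grad, manual_grad_w))  # True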
@add_to_class(LinearRegression)
def update(self, x: torch.Tensor, y_true: torch.Tensor, y_pred: torch.Tensor, lr: float):
"""
Update the model parameters with L2 regularization.
Args:
x: Input tensor of shape (n_samples, n_features).
        y_true: Target tensor of shape (n_samples, out_features).
        y_pred: Predicted output tensor of shape (n_samples, out_features).
lr: Learning rate.
"""
delta = 2 * (y_pred - y_true) / y_true.numel()
self.b -= lr * delta.sum(axis=0)
    self.w -= lr * (torch.matmul(x.T, delta) + self.lambd * self.w)  # L2 regularization
Scratch vs Torch.nn¶
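On the PyTorch side we will not add this term by hand: torch.optim.SGD applies the same $\lambda \mathbf{W}$ term to the gradient when weight_decay is set. A minimal illustration of that behavior:

p = torch.ones(1, requires_grad=True)
opt = torch.optim.SGD([p], lr=0.1, weight_decay=0.5)
p.grad = torch.zeros(1)  # pretend the loss gradient is zero
opt.step()
print(p)  # tensor([0.9500], ...): p -= lr * (0 + weight_decay * p)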
Torch.nn model¶
class TorchLinearRegression(nn.Module):
def __init__(self, n_features, n_out_features):
super(TorchLinearRegression, self).__init__()
self.layer = nn.Linear(n_features, n_out_features, device=device)
self.loss = nn.MSELoss()
def forward(self, x):
return self.layer(x)
def evaluate(self, x, y):
self.eval()
with torch.no_grad():
y_pred = self.forward(x)
return self.loss(y_pred, y).item()
def fit(self, x, y, epochs, lr, batch_size, x_valid, y_valid, weight_decay):
optimizer = torch.optim.SGD([
{'params': self.layer.weight, 'weight_decay': weight_decay},
            {'params': self.layer.bias}  # no weight decay for the bias; only the weights are regularized
], lr=lr)
for epoch in range(epochs):
loss_t = []
for batch in range(0, len(y), batch_size):
end_batch = batch + batch_size
y_pred = self.forward(x[batch:end_batch])
loss = self.loss(y_pred, y[batch:end_batch])
loss_t.append(loss.item())
optimizer.zero_grad()
loss.backward()
optimizer.step()
loss_t = round(sum(loss_t) / len(loss_t), 4)
loss_v = round(self.evaluate(x_valid, y_valid), 4)
            print(f'epoch: {epoch} - MSE: {loss_t} - MSE_v: {loss_v}')
torch_model = TorchLinearRegression(N, NO)
scratch model¶
LAMBD: float = 0.01
model = LinearRegression(N, NO, LAMBD)
model.lambd
0.01
evals¶
import modified MAPE¶
# This cell imports torch_mape
# if you are running this notebook locally
# or from Google Colab.
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
sys.path.append(module_path)
try:
from tools.torch_metrics import torch_mape as mape
print('mape imported locally.')
except ModuleNotFoundError:
import subprocess
repo_url = 'https://raw.githubusercontent.com/PilotLeoYan/inside-deep-learning/main/content/tools/torch_metrics.py'
local_file = 'torch_metrics.py'
subprocess.run(['wget', repo_url, '-O', local_file], check=True)
try:
from torch_metrics import torch_mape as mape # type: ignore
print('mape imported from GitHub.')
except Exception as e:
        print(e)
mape imported locally.
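For reference, MAPE (mean absolute percentage error) measures the relative deviation between two tensors, so 0.0 means the inputs are identical. A minimal sketch of the idea (the actual torch_mape in tools/torch_metrics.py may differ in details such as scaling or zero handling):

def simple_mape(y_pred: torch.Tensor, y_true: torch.Tensor) -> float:
    # mean of the absolute relative error
    return torch.abs((y_true - y_pred) / y_true).mean().item()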
predict¶
mape(
model.predict(X_valid),
torch_model.forward(X_valid)
)
4536.602432801
copy parameters¶
model.copy_params(torch_model.layer)
parameters = (model.b.clone(), model.w.clone())
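Because both models now share exactly the same weights and bias, any difference we observe from here on can only come from the two update rules.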
predict after copy parameters¶
mape(
model.predict(X_valid),
torch_model.forward(X_valid)
)
0.0
loss¶
mape(
model.evaluate(X_valid, Y_valid),
torch_model.evaluate(X_valid, Y_valid)
)
0.0
train¶
LR = 0.01 # learning rate
EPOCHS = 16 # number of epochs
BATCH = len(X_train) // 3  # batch size
torch_model.fit(
X_train, Y_train,
EPOCHS, LR, BATCH,
X_valid, Y_valid,
LAMBD
)
epoch: 0 - MSE: 9379.6668 - MSE_v: 11418.2368
epoch: 1 - MSE: 8959.7574 - MSE_v: 10942.8714
epoch: 2 - MSE: 8563.3423 - MSE_v: 10488.7917
epoch: 3 - MSE: 8188.2793 - MSE_v: 10054.8382
epoch: 4 - MSE: 7832.7495 - MSE_v: 9639.9506
epoch: 5 - MSE: 7495.1956 - MSE_v: 9243.1538
epoch: 6 - MSE: 7174.2734 - MSE_v: 8863.5466
epoch: 7 - MSE: 6868.8127 - MSE_v: 8500.2915
epoch: 8 - MSE: 6577.7858 - MSE_v: 8152.6074
epoch: 9 - MSE: 6300.2827 - MSE_v: 7819.7632
epoch: 10 - MSE: 6035.4912 - MSE_v: 7501.0722
epoch: 11 - MSE: 5782.6804 - MSE_v: 7195.8875
epoch: 12 - MSE: 5541.1882 - MSE_v: 6903.5987
epoch: 13 - MSE: 5310.4102 - MSE_v: 6623.6283
epoch: 14 - MSE: 5089.7919 - MSE_v: 6355.429
epoch: 15 - MSE: 4878.8213 - MSE_v: 6098.4816
model.fit(
X_train, Y_train,
EPOCHS, LR, BATCH,
X_valid, Y_valid
)
epoch: 0 - MSE: 9379.6668 - MSE_v: 11418.2368
epoch: 1 - MSE: 8959.7574 - MSE_v: 10942.8714
epoch: 2 - MSE: 8563.3423 - MSE_v: 10488.7917
epoch: 3 - MSE: 8188.2793 - MSE_v: 10054.8382
epoch: 4 - MSE: 7832.7495 - MSE_v: 9639.9506
epoch: 5 - MSE: 7495.1956 - MSE_v: 9243.1538
epoch: 6 - MSE: 7174.2734 - MSE_v: 8863.5466
epoch: 7 - MSE: 6868.8127 - MSE_v: 8500.2915
epoch: 8 - MSE: 6577.7858 - MSE_v: 8152.6074
epoch: 9 - MSE: 6300.2827 - MSE_v: 7819.7632
epoch: 10 - MSE: 6035.4912 - MSE_v: 7501.0722
epoch: 11 - MSE: 5782.6804 - MSE_v: 7195.8875
epoch: 12 - MSE: 5541.1882 - MSE_v: 6903.5987
epoch: 13 - MSE: 5310.4102 - MSE_v: 6623.6283
epoch: 14 - MSE: 5089.7919 - MSE_v: 6355.429
epoch: 15 - MSE: 4878.8213 - MSE_v: 6098.4816
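The per-epoch losses match the PyTorch run exactly, which suggests our manual $\lambda \mathbf{W}$ term implements the same update as SGD's weight_decay.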
predict after training¶
mape(
model.predict(X_valid),
torch_model.forward(X_valid)
)
7.350389485622577e-14
weight¶
mape(
model.w.clone(),
torch_model.layer.weight.detach().T
)
1.1728863346159968e-14
bias¶
mape(
model.b.clone(),
torch_model.layer.bias.detach()
)
4.447106248397419e-14
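After training, the predictions, weights, and bias of both models agree to within floating-point rounding error (MAPE on the order of $10^{-14}$), confirming that our scratch $L_2$ regularization matches PyTorch's weight_decay.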
