Custom loss functions.
%load_ext autoreload
%autoreload 2
%matplotlib inline
# Used for testing only.
from fastai2.metrics import LabelSmoothingCrossEntropy
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from htools import assert_raises, InvalidArgumentError
For demonstration, each row of our soft labels tries to capture a different case:
- Row 0: High confidence in one class; the model is highly confident in the correct class.
- Row 1: Moderate confidence in 2 different classes; the model is highly confident in the best class.
- Row 2: Confidence split among a few classes; the model assigns most of its probability to the two nonzero but non-ideal classes.
Row 2 should benefit slightly from label smoothing since the model predicts classes that are not ideal but also not entirely wrong.
# 3 row mini batch
y_label = torch.tensor([0, 1, 3])
y_ohe = torch.tensor([[1, 0, 0, 0, 0],
                      [0, 1, 0, 0, 0],
                      [0, 0, 0, 1, 0]], dtype=torch.float)
y_smooth = torch.tensor([[.9, .1, 0, 0, 0],
                         [.35, .65, 0, 0, 0],
                         [0, .1, .2, .7, 0]])
logits = torch.tensor([[6, 1, 3, 1, 2],
                       [5, 8, 1, 1, 2],
                       [1, 3, 4, 2, 0]], dtype=torch.float)
y_hat = F.softmax(logits, dim=-1)
y_hat
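smooth_soft_labels is defined in the library module rather than in this section. To make the tests below easier to follow, here is a minimal sketch of what it appears to do, inferred from the expected outputs; the helper name and implementation details are my assumptions, not the actual library code.
def smooth_soft_labels_sketch(labels, alpha):
    """Sketch only (an assumption, not the library implementation): raise
    each zero entry to alpha / num_classes and subtract the added mass
    evenly from the nonzero entries so every row still sums to 1. Assumes
    each row has at least one nonzero entry.
    """
    if alpha < 0:
        raise InvalidArgumentError('alpha must be non-negative.')
    if alpha == 0:
        return labels
    n_classes = labels.shape[-1]
    zero_mask = labels == 0
    n_zero = zero_mask.sum(-1, keepdim=True).float()
    # Total mass given to the zero entries in each row.
    added = n_zero * alpha / n_classes
    return torch.where(zero_mask,
                       torch.full_like(labels, alpha / n_classes),
                       labels - added / (n_classes - n_zero))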
# Test: smooth_soft_labels
assert torch.allclose(
    smooth_soft_labels(y_smooth, .1),
    torch.tensor([[0.8700, 0.0700, 0.0200, 0.0200, 0.0200],
                  [0.3200, 0.6200, 0.0200, 0.0200, 0.0200],
                  [0.0200, 0.0867, 0.1867, 0.6867, 0.0200]]),
    atol=1e-3), 'Case 1: alpha=0.1'
assert torch.allclose(smooth_soft_labels(y_smooth, 0.0), y_smooth, atol=1e-3),\
    'Case 2: alpha=0.0'
with assert_raises(InvalidArgumentError):
    smooth_soft_labels(y_smooth, -.1)
modes = ('none', 'sum', 'mean')
for mode in modes:
    j = soft_label_cross_entropy_with_logits(logits, y_ohe, reduction=mode)
    print(f'reduction={mode}:', j)
print('soft_label_cross_entropy_with_logits:\n' + '-'*37)
for mode in modes:
    j = soft_label_cross_entropy_with_logits(logits, y_smooth, reduction=mode)
    print(f'reduction={mode}:', j)
print('\nsoft_label_cross_entropy:\n' + '-'*25)
for mode in modes:
    j = soft_label_cross_entropy(y_hat, y_smooth, reduction=mode)
    print(f'reduction={mode}:', j)
for mode in modes:
    j = soft_label_cross_entropy_with_logits(logits, y_smooth, alpha=.2,
                                             reduction=mode)
    print(f'reduction={mode}:', j)
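The soft label losses themselves are also defined in the library module. To keep this section readable on its own, here are minimal sketches consistent with the calls above; the signatures are inferred from usage and the bodies are my assumptions about the standard formulation, not the actual library code.
def soft_label_cross_entropy_with_logits_sketch(y_pred, y_true, alpha=0.0,
                                                reduction='mean'):
    """Sketch only: cross entropy where the target is a distribution over
    classes rather than a single class index. Optionally smooths the
    targets first.
    """
    if alpha:
        y_true = smooth_soft_labels(y_true, alpha)
    # Row-wise cross entropy -sum_c(y_c * log(p_c)), using log_softmax on
    # the logits for numerical stability.
    rowwise_j = -(y_true * F.log_softmax(y_pred, dim=-1)).sum(-1)
    if reduction == 'none': return rowwise_j
    return getattr(rowwise_j, reduction)(dim=-1)

def soft_label_cross_entropy_sketch(y_pred, y_true, alpha=0.0,
                                    reduction='mean'):
    """Sketch only: same idea, but y_pred is already a probability
    distribution (e.g. the output of a softmax layer).
    """
    if alpha:
        y_true = smooth_soft_labels(y_true, alpha)
    rowwise_j = -(y_true * y_pred.log()).sum(-1)
    if reduction == 'none': return rowwise_j
    return getattr(rowwise_j, reduction)(dim=-1)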
def plot_loss_by_alpha(y_pred, y_true, loss_func=None, loss_class=None):
    """Plot each row's loss as the smoothing parameter alpha varies.
    Pass in either a loss function or a loss class (not both).
    """
    alphas = np.arange(0, 1, .05)
    if loss_class:
        js = {a: loss_class(a, 'none')(y_pred, y_true) for a in alphas}
    else:
        js = {a: loss_func(y_pred, y_true, a, 'none') for a in alphas}
    fig, ax = plt.subplots()
    # One curve per row of the mini batch.
    for i in range(3):
        ax.plot(alphas, [x[i] for x in js.values()], label=i)
    plt.xlabel('Alpha (Smoothing Parameter)')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
Notice how the gap between row 2 and the other rows begins to narrow as alpha increases, as predicted.
plot_loss_by_alpha(y_hat, y_smooth, soft_label_cross_entropy)
plot_loss_by_alpha(logits, y_smooth, soft_label_cross_entropy_with_logits)
plot_loss_by_alpha(logits, y_ohe, soft_label_cross_entropy_with_logits)
For comparison, here is the fastai label smoothing loss. It should return the same results in this case, since with hard labels smoothing amounts to adding uniform noise. The advantage of ours is the ability to specify a non-uniform prior distribution.
plot_loss_by_alpha(logits, y_label, loss_class=LabelSmoothingCrossEntropy)
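If I recall the fastai implementation correctly, with smoothing parameter eps and C classes it computes roughly (1 - eps) * CE(logits, y) + eps * mean over classes of -log(p_c), which is equivalent to cross entropy against uniformly smoothed targets: 1 - eps + eps/C for the true class and eps/C everywhere else. Our version instead lets the soft labels act as a prior that determines where the smoothing mass goes.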
def classification_mae_with_logits(y_pred, y_true, reduction='mean'):
    """Mean absolute error for classification using soft or one hot encoded
    labels (can be helpful for dealing with noisy labels). Classification
    models often forgo the final softmax layer and use a loss function that
    computes the log softmax for numerical stability reasons. This loss
    function makes it easy to swap different loss functions in and out
    without changing the model or the training loop.

    Parameters
    ----------
    y_pred: torch.FloatTensor
        Logits. Shape (bs, num_classes).
    y_true: torch.FloatTensor
        Labels, either one hot encoded or soft. Shape (bs, num_classes).
    reduction: str
        Just like any PyTorch loss function, this determines how to
        aggregate the loss over a mini batch. One of ('mean', 'sum',
        'none').
    """
    # The built-in reduction would aggregate over every value in the tensor
    # because it expects a single predicted value per sample. Since we
    # predict a vector for each sample, we compute row-wise sums regardless
    # of our choice of reduction, then aggregate over the batch dimension
    # if specified.
    rowwise_j = F.l1_loss(F.softmax(y_pred, dim=-1),
                          y_true, reduction='none').sum(-1)
    if reduction == 'none': return rowwise_j
    return getattr(rowwise_j, reduction)(dim=-1)
logits
F.softmax(logits, dim=-1)
y_ohe
for mode in modes:
    j = classification_mae_with_logits(logits, y_ohe, mode)
    print(mode, j)
bs = 2
x1 = torch.randn(bs, 5)
x2 = torch.randn(bs, 5)
y = torch.tensor([1, 0]).unsqueeze(-1)
# Make the first pair similar and the second pair dissimilar.
x1[0] += torch.arange(0, 100, 20)
x1[1] -= 50
x2[0] += torch.arange(0, 100, 20)
x2[1] += 25
print(x1)
print(x2)
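ContrastiveLoss1d is defined in the library module. Here is a minimal sketch of the classic contrastive loss formulation (Hadsell et al. 2006) that is consistent with the usage below; the margin default and reduction handling are my assumptions, not the actual library code.
class ContrastiveLoss1dSketch(nn.Module):
    """Sketch only: similar pairs (y=1) are penalized by their squared
    distance; dissimilar pairs (y=0) are penalized when they fall inside
    a margin m.
    """

    def __init__(self, m=1.0, reduction='mean'):
        super().__init__()
        self.m = m
        self.reduction = reduction

    def forward(self, x1, x2, y):
        # Euclidean distance for each pair. keepdim gives shape (bs, 1)
        # to match y.
        d = F.pairwise_distance(x1, x2, keepdim=True)
        j = y * d.pow(2) + (1 - y) * torch.clamp(self.m - d, min=0).pow(2)
        if self.reduction == 'none': return j
        return getattr(j, self.reduction)()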
loss = ContrastiveLoss1d(reduction='mean')
loss(x1, x2, y)
loss = ContrastiveLoss1d(reduction='sum')
loss(x1, x2, y)
loss = ContrastiveLoss1d(reduction='none')
loss(x1, x2, y)
x2
# x3 provides 3 examples for each source image in x2. Some are different
# and some are similar (notice where noise is amplified vs. dampened).
noise = torch.rand(bs, 3, 5) * 10
noise[0, [0, -1]] /= 100
noise[0, 1] *= 5
noise[-1, -1] += 500
x3 = x2[:, None, ...] + noise
print(x3.shape)
print(x3)
y2d = torch.tensor([[1, 0, 1],
                    [1, 1, 0]])
y2d
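ContrastiveLoss2d presumably applies the same formula to each (source, candidate) pair. A rough sketch follows; the 'row' reduction semantics (averaging within each row) are my guess based on the demos below, not the actual library code.
class ContrastiveLoss2dSketch(nn.Module):
    """Sketch only: contrastive loss where each source vector in x1
    (shape (bs, dim)) is compared to several candidates in x2
    (shape (bs, n, dim)), with labels y of shape (bs, n).
    """

    def __init__(self, m=1.0, reduction='mean'):
        super().__init__()
        self.m = m
        self.reduction = reduction

    def forward(self, x1, x2, y):
        # Distance from each source vector to each of its candidates.
        # Shape (bs, n).
        d = (x2 - x1[:, None, :]).norm(dim=-1)
        j = y * d.pow(2) + (1 - y) * torch.clamp(self.m - d, min=0).pow(2)
        if self.reduction == 'none': return j
        # Assumed semantics: average the candidate losses within each row.
        if self.reduction == 'row': return j.mean(-1)
        return getattr(j, self.reduction)()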
loss = ContrastiveLoss2d()
loss(x2, x3, y2d)
loss = ContrastiveLoss2d(reduction='sum')
loss(x2, x3, y2d)
loss = ContrastiveLoss2d(reduction='row')
loss(x2, x3, y2d)
loss = ContrastiveLoss2d(reduction='none')
loss(x2, x3, y2d)