Custom activations, layers, and layer blocks are contained in this module.
%load_ext autoreload
%autoreload 2
%matplotlib inline
# Used for testing only.
from collections import defaultdict, Counter
from itertools import chain
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from htools import assert_raises, InvalidArgumentError, smap
from incendio.data import probabilistic_hash_item, probabilistic_hash_tensor
import pandas_htools

Activations

class GRelu[source]

GRelu(leak=0.0, max=inf, sub=0.0) :: Module

Generic ReLU.
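The source isn't shown here, but the parameters map onto the usual generalized ReLU: a leaky ReLU clipped at `max` and shifted down by `sub`. A minimal functional sketch of that assumption:

import torch
import torch.nn.functional as F

def generic_relu(x, leak=0., max=float('inf'), sub=0.):
    # Sketch, not the actual source: leaky ReLU, clamp the top at
    # `max`, then subtract `sub` (e.g. leak=0.1, max=6.0, sub=0.4
    # in the default ConvBlock activation below).
    return F.leaky_relu(x, negative_slope=leak).clamp(max=max) - sub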

class Mish[source]

Mish() :: Module

OOP form of mish activation.

Mish: A Self Regularized Non-Monotonic Neural Activation Function
https://arxiv.org/pdf/1908.08681v1.pdf

mish[source]

mish(x)

Functional form of mish activation.

Mish: A Self Regularized Non-Monotonic Neural Activation Function
https://arxiv.org/pdf/1908.08681v1.pdf

Parameters
----------
x: torch.Tensor[float]
    Input tensor.
Returns
-------
torch.Tensor[float]: Tensor of same shape as input x.
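The paper defines mish as x * tanh(softplus(x)); a direct implementation of that formula (assumed to match the function above):

import torch
import torch.nn.functional as F

def mish(x):
    # mish(x) = x * tanh(ln(1 + exp(x)))
    return x * torch.tanh(F.softplus(x))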
def plot_activations(z, a, mode='scatter', **kwargs):
    """Plot an input tensor and its corresponding activations.  Both tensors
    will be flattened for plotting.
    
    Parameters
    ----------
    z: torch.Tensor
        Tensor containing values to plot on the x axis (we can often think of
        this as the output of a linear layer, where z=f(x) and a=mish(z)).
    a: torch.Tensor
        Tensor containing values to plot on the y axis.
    mode: str
        'scatter' for scatter plot or 'plot' for line plot.
    kwargs: Values to be passed to the matplotlib plotting function, such as 
        's' when in 'scatter' mode or 'lw' in 'plot' mode.
        
    Returns
    -------
    None
    """
    plt_func = getattr(plt, mode)
    kwargs = kwargs or {}
    if mode == 'scatter' and not kwargs:
        kwargs = {'s': .75}
    plt_func(z.numpy().flatten(), a.numpy().flatten(), **kwargs)
    plt.axvline(0, lw=.5, alpha=.5)
    plt.axhline(0, lw=.5, alpha=.5)
    plt.show()
x = torch.arange(-5, 5, .05)
a = mish(x)
plot_activations(x, a, 'plot')

Layer Blocks

class ConvBlock[source]

ConvBlock(c_in, c_out, kernel_size=3, norm=True, activation=GReLU(leak=0.1, max=6.0, sub=0.4), **kwargs) :: Module

Create a convolutional block optionally followed by a batch norm layer.
conv = ConvBlock(3, 5, norm=False)
conv
ConvBlock(
  (block): Sequential(
    (0): Conv2d(3, 5, kernel_size=(3, 3), stride=(1, 1))
    (1): GReLU(leak=0.1, max=6.0, sub=0.4)
  )
)
x = torch.rand(2, 3, 4, 4)
conv(x).shape
torch.Size([2, 5, 2, 2])
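With kernel_size=3 and no padding, each spatial dimension shrinks by kernel_size - 1, so the 4x4 input becomes a 2x2 output.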

class ResBlock[source]

ResBlock(c_in, kernel_size=3, norm=True, activation=GReLU(leak=0.1, max=6.0, sub=0.4), stride=1, padding=1, skip_size=2, **kwargs) :: Module

Residual block built from ConvBlocks. Judging by the reprs below, the
input passes through `skip_size` ConvBlocks (which contain no activation
of their own), the original input is added back via a skip connection,
and a single activation is applied to the sum. As in ConvBlock, `norm`
toggles batch norm inside each inner block.
ResBlock(4)
ResBlock(
  (layers): ModuleList(
    (0): ConvBlock(
      (block): Sequential(
        (0): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): ConvBlock(
      (block): Sequential(
        (0): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (activation): GReLU(leak=0.1, max=6.0, sub=0.4)
)
ResBlock(4, norm=False)
ResBlock(
  (layers): ModuleList(
    (0): ConvBlock(
      (block): Sequential(
        (0): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (1): ConvBlock(
      (block): Sequential(
        (0): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
  )
  (activation): GReLU(leak=0.1, max=6.0, sub=0.4)
)
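The reprs above suggest the forward logic: the inner ConvBlocks carry no activation of their own, and the block's single activation fires only after the skip connection adds the input back. A sketch inferred from the repr, not the actual source:

def res_block_forward(self, x):
    # Run the input through each ConvBlock, then add the original
    # input back before applying the one shared activation.
    out = x
    for layer in self.layers:
        out = layer(out)
    return self.activation(out + x)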

ReflectionPaddedConv2d[source]

ReflectionPaddedConv2d(in_channels, out_channels, padding=1, kernel_size=3, **kwargs)

Conv2d only allows padding_mode of `zeros` or `circular`. This layer is a
quick way for us to use reflection padding.


Applies a 2D convolution over an input signal composed of several input
    planes.

    In the simplest case, the output value of the layer with input size
    :math:`(N, C_{\text{in}}, H, W)` and output :math:`(N, C_{\text{out}}, H_{\text{out}}, W_{\text{out}})`
    can be precisely described as:

    .. math::
        \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
        \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k)


    where :math:`\star` is the valid 2D `cross-correlation`_ operator,
    :math:`N` is a batch size, :math:`C` denotes a number of channels,
    :math:`H` is a height of input planes in pixels, and :math:`W` is
    width in pixels.

    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

    * :attr:`stride` controls the stride for the cross-correlation, a single
      number or a tuple.

    * :attr:`padding` controls the amount of implicit zero-paddings on both
      sides for :attr:`padding` number of points for each dimension.

    * :attr:`dilation` controls the spacing between the kernel points; also
      known as the à trous algorithm. It is harder to describe, but this `link`_
      has a nice visualization of what :attr:`dilation` does.

    * :attr:`groups` controls the connections between inputs and outputs.
      :attr:`in_channels` and :attr:`out_channels` must both be divisible by
      :attr:`groups`. For example,

        * At groups=1, all inputs are convolved to all outputs.
        * At groups=2, the operation becomes equivalent to having two conv
          layers side by side, each seeing half the input channels,
          and producing half the output channels, and both subsequently
          concatenated.
        * At groups= :attr:`in_channels`, each input channel is convolved with
          its own set of filters, of size:
          :math:`\left\lfloor\frac{out\_channels}{in\_channels}\right\rfloor`.

    The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be:

        - a single ``int`` -- in which case the same value is used for the height and width dimension
        - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension,
          and the second `int` for the width dimension

    Note:

         Depending on the size of your kernel, several (of the last)
         columns of the input might be lost, because it is a valid `cross-correlation`_,
         and not a full `cross-correlation`_.
         It is up to the user to add proper padding.

    Note:

        When `groups == in_channels` and `out_channels == K * in_channels`,
        where `K` is a positive integer, this operation is also termed in
        literature as depthwise convolution.

        In other words, for an input of size :math:`(N, C_{in}, H_{in}, W_{in})`,
        a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments
        :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`.

    Note:
        In some circumstances when using the CUDA backend with CuDNN, this operator
        may select a nondeterministic algorithm to increase performance. If this is
        undesirable, you can try to make the operation deterministic (potentially at
        a performance cost) by setting ``torch.backends.cudnn.deterministic =
        True``.
        Please see the notes on :doc:`/notes/randomness` for background.


    Args:
        in_channels (int): Number of channels in the input image
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of
            the input. Default: 0
        padding_mode (string, optional): ``'zeros'``, ``'reflect'``,
            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
        groups (int, optional): Number of blocked connections from input
            channels to output channels. Default: 1
        bias (bool, optional): If ``True``, adds a learnable bias to the
            output. Default: ``True``

    Shape:
        - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
        - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where

          .. math::
              H_{out} = \left\lfloor\frac{H_{in}  + 2 \times \text{padding}[0] - \text{dilation}[0]
                        \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor

          .. math::
              W_{out} = \left\lfloor\frac{W_{in}  + 2 \times \text{padding}[1] - \text{dilation}[1]
                        \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor

    Attributes:
        weight (Tensor): the learnable weights of the module of shape
            :math:`(\text{out\_channels}, \frac{\text{in\_channels}}{\text{groups}},`
            :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]})`.
            The values of these weights are sampled from
            :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
            :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
        bias (Tensor):   the learnable bias of the module of shape
            (out_channels). If :attr:`bias` is ``True``,
            then the values of these weights are
            sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
            :math:`k = \frac{groups}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`

    Examples:

        >>> # With square kernels and equal stride
        >>> m = nn.Conv2d(16, 33, 3, stride=2)
        >>> # non-square kernels and unequal stride and with padding
        >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
        >>> # non-square kernels and unequal stride and with padding and dilation
        >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
        >>> input = torch.randn(20, 16, 50, 100)
        >>> output = m(input)

    .. _cross-correlation:
        https://en.wikipedia.org/wiki/Cross-correlation

    .. _link:
        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
def show_img(img):
    plt.imshow(img.permute(1, 2, 0) / 255)
    plt.show()
rconv = ReflectionPaddedConv2d(3, 3, kernel_size=1, padding=2)
rconv
ReflectionPaddedConv2d(
  (reflect): ReflectionPad2d((2, 2, 2, 2))
  (conv): Conv2d(3, 3, kernel_size=(1, 1), stride=(1, 1))
)
x = torch.randint(255, (1, 3, 3, 3)).float()
show_img(x[0])
x2 = rconv.reflect(x)
show_img(x2[0])
# Tests
assert nn.Conv2d.__doc__ in ReflectionPaddedConv2d.__doc__

with assert_raises(InvalidArgumentError):
    ReflectionPaddedConv2d(3, 3, padding_mode='zeros')
As expected, got InvalidArgumentError(Remove `padding_mode` from arguments.).

class SmoothSoftmaxBase[source]

SmoothSoftmaxBase(log=False, temperature='auto', dim=-1) :: Module

Parent class of SmoothSoftmax and SmoothLogSoftmax (softmax or log
softmax with temperature baked in). There shouldn't be a need to
instantiate this class directly.

class SmoothSoftmax[source]

SmoothSoftmax(temperature='auto', dim=-1) :: SmoothSoftmaxBase

Softmax with temperature baked in: the non-log variant of
SmoothSoftmaxBase. See that class for details.

class SmoothLogSoftmax[source]

SmoothLogSoftmax(temperature='auto', dim=-1) :: SmoothSoftmaxBase

Log softmax with temperature baked in: the log variant of
SmoothSoftmaxBase. See that class for details.
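With a fixed temperature, both variants reduce to scaling the logits before the (log) softmax. A sketch under that assumption ('auto' presumably derives the temperature from the input, which isn't shown here):

import torch.nn.functional as F

def smooth_softmax(x, temperature=2.0, dim=-1, log=False):
    # Temperature-scaled (log) softmax: temperature > 1 flattens the
    # distribution, temperature < 1 sharpens it.
    f = F.log_softmax if log else F.softmax
    return f(x / temperature, dim=dim)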

class SpatialSoftmax[source]

SpatialSoftmax(temperature='auto') :: Module

Apply softmax over the height and width dimensions of a batch of image
tensors (or image-like tensors). Concretely, inputs will usually have
shape (batch size, channels, height, width), while outputs will have the
same shape but values for each feature map will now sum to 1. Essentially,
we now have a heatmap of what region in each image to focus on.
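Ignoring the temperature (which works as in SmoothSoftmax), the operation amounts to flattening each feature map, applying softmax over its pixels, and restoring the spatial shape. A minimal sketch:

import torch.nn.functional as F

def spatial_softmax(x):
    # x: (bs, channels, height, width) -> same shape, where each
    # feature map's values now sum to 1.
    bs, c, h, w = x.shape
    return F.softmax(x.view(bs, c, h * w), dim=-1).view(bs, c, h, w)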

class Dropin[source]

Dropin(scale=0.5) :: Module

Additive dropout. This injects small amounts of noise into a model
in the form of randomly generated floats from a zero-centered
gaussian distribution (variance can be adjusted). This does nothing
in eval mode. Unlike Dropout, this does not scale weights during
training since it does not bias them in any direction.
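The docstring and the simulation further down are consistent with a forward pass along these lines (a sketch; the exact mapping from `scale` to noise variance is an assumption):

import torch
import torch.nn as nn

class DropinSketch(nn.Module):
    """Additive dropout sketch: add zero-centered gaussian noise in
    train mode, pass inputs through untouched in eval mode."""

    def __init__(self, scale=0.5):
        super().__init__()
        self.scale = scale

    def forward(self, x):
        if not self.training:
            return x
        # Larger `scale` -> noisier activations (assumed relationship).
        self.noise = torch.randn_like(x) * self.scale
        return x + self.noise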
class Net(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.drop = Dropin()
        
    def forward(self, x):
        return self.drop(x)
net = Net()
x = torch.randn(8, 128, 128, 3)
assert np.corrcoef(net(x).flatten(), x.flatten())[0][1] > .9
net.eval()
assert torch.eq(net(x), x).all()
assert not net.drop.training
def simulate_activation_stats(scale=1.0, trials=10_000):
    act_stats = defaultdict(list)
    noise_stats = defaultdict(list)
    
    drop = Dropin(scale)
    for _ in range(trials):
        x = torch.randn(3, 4, dtype=torch.float)
        z = drop(x)
        noise = drop.noise
        noise_stats['mean'].append(noise.mean())
        noise_stats['std'].append(noise.std())
        noise_stats['act_corr'].append(
            np.corrcoef(z.flatten(), noise.flatten())[0][1]
        )
        
        act_stats['mean'].append(z.mean())
        act_stats['std'].append(z.std())
        act_stats['x_corr'].append(
            np.corrcoef(z.flatten(), x.flatten())[0][1]
        )

    return pd.DataFrame(dict(
        act={k: np.mean(v).round(4) for k, v in act_stats.items()}, 
        noise={k: np.mean(v).round(4) for k, v in noise_stats.items()}
    ))
for scale in [10, 1, .75, .5, .25, .1]:
    print('\n', scale)
    simulate_activation_stats(scale, 1_000).pprint()
 10
               act   noise
mean        0.0132  0.0094
std         1.8189  1.5192
x_corr      0.5324     NaN
act_corr       NaN  0.8304

 1
               act   noise
mean       -0.0141  0.0034
std         1.0921  0.4870
x_corr      0.8855     NaN
act_corr       NaN  0.4282

 0.75
               act   noise
mean       -0.0015  0.0022
std         1.0633  0.4240
x_corr      0.9100     NaN
act_corr       NaN  0.3899

 0.5
               act   noise
mean        0.0107  0.0008
std         1.0558  0.3442
x_corr      0.9409     NaN
act_corr       NaN  0.3235

 0.25
               act   noise
mean        0.0098 -0.0057
std         1.0013  0.2461
x_corr      0.9667     NaN
act_corr       NaN  0.2298

 0.1
               act   noise
mean       -0.0057 -0.0014
std         0.9969  0.1533
x_corr      0.9868     NaN
act_corr       NaN  0.1394

class LinearSkipBlock[source]

LinearSkipBlock(x_dim, layer_dims, op, activation='mish') :: Module

This lets us easily create residual block equivalents with linear
layers.

class LinearResBlock[source]

LinearResBlock(x_dim, hidden_dims, activation='mish') :: LinearSkipBlock

Equivalent of ResNet block with linear layers.

class LinearDenseBlock[source]

LinearDenseBlock(x_dim, hidden_dims, activation='mish') :: LinearSkipBlock

Equivalent of DenseNet block with linear layers.

class WeightedLinearResBlock[source]

WeightedLinearResBlock(x_dim, hidden_dims, weights=(0.25, 0.75), activation='mish') :: LinearSkipBlock

Like a LinearResBlock but takes a weighted average of the input and
output rather than adding them. Addition gives them equal weight and we
may want to weight the output more heavily.
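In other words, where a plain residual block computes x + f(x), this computes a convex combination of the two. A one-line sketch:

def weighted_skip(x, block, weights=(0.25, 0.75)):
    # Weighted average of input and block output; (0.5, 0.5) halves a
    # plain residual sum, while the default favors the block's output.
    return weights[0] * x + weights[1] * block(x)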

class SkipConnection[source]

SkipConnection(block, op='add', input_weight=None) :: Module

More generalized version of skip connection. Eventually maybe rewrite
various res/dense/weighted conv blocks with this.

Examples
--------
>> x = torch.randn(3, 4)
>> dense = nn.Linear(4, 2)
>> dense(x).shape

torch.Size([3, 2])

>> skip = SkipConnection(dense, op='cat')
>> skip(x).shape

torch.Size([3, 6])

>> skip = SkipConnection(dense, op='add')
>> skip(x).shape

RuntimeError: The size of tensor a (4) must match the size of tensor b (2)
at non-singleton dimension 1

Embeddings and Encodings

trunc_normal_[source]

trunc_normal_(x, mean=0.0, std=1.0)

Ported from fastai to remove dependency:

Truncated normal initialization.
From https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12
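fastai approximates truncation cheaply with fmod rather than resampling; the ported version presumably follows the same recipe:

def trunc_normal_(x, mean=0., std=1.):
    # Sample a standard normal in place, fold values into (-2, 2) with
    # fmod_, then rescale: a cheap approximation of true truncation.
    return x.normal_().fmod_(2).mul_(std).add_(mean)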

class InitializedEmbedding[source]

InitializedEmbedding(num_embeddings:int, embedding_dim:int, padding_idx:Optional[int]=None, max_norm:Optional[float]=None, norm_type:float=2.0, scale_grad_by_freq:bool=False, sparse:bool=False, _weight:Optional[Tensor]=None) :: Embedding

Same as nn.Embedding but with truncated normal initialization. This
also differs from fastai's Embedding class in that it allows padding.
InitializedEmbedding(4, 3, 0).weight
Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000],
        [ 0.0031, -0.0033, -0.0013],
        [ 0.0076, -0.0107,  0.0008],
        [ 0.0119, -0.0014,  0.0027]], requires_grad=True)
InitializedEmbedding(4, 3, 3).weight
Parameter containing:
tensor([[ 0.0038, -0.0103, -0.0074],
        [ 0.0104, -0.0036,  0.0039],
        [-0.0007,  0.0008,  0.0191],
        [ 0.0000,  0.0000,  0.0000]], requires_grad=True)
InitializedEmbedding(4, 3).weight
Parameter containing:
tensor([[-0.0082,  0.0058, -0.0055],
        [-0.0104, -0.0144,  0.0036],
        [ 0.0028, -0.0020, -0.0009],
        [ 0.0046, -0.0088, -0.0026]], requires_grad=True)

class BloomEmbedding[source]

BloomEmbedding(n_emb=251, emb_dim=100, n_hashes=4, padding_idx=0, pre_hashed=False) :: Module

Bloom Embedding layer for memory-efficient word representations.
Each word is encoded by a combination of rows of the embedding
matrix. The number of rows can therefore be far lower than the number
of words in our vocabulary while still providing unique representations.
The reduction in rows allows us to use memory in other ways: a larger
embedding dimension, more or larger layers after the embedding,
larger batch sizes, etc.

Note that if hashing is done in the Dataset, we could use a simple
nn.EmbeddingBag to achieve the same thing. Many users have reported
poor performance with this layer though (especially on CPU, but in some
cases on GPU), so I stick with the standard Embedding. We also bake in
the truncated normal initialization provided by fastai, with a slight tweak
to allow a row for padding.
class Data(Dataset):
    
    def __init__(self, sentences, labels, seq_len):
        x = [s.split(' ') for s in sentences]
        self.w2i = self.make_w2i(x)
        self.seq_len = seq_len
        self.x = self.encode(x)
        self.y = torch.tensor(labels)
        
    def __getitem__(self, i):
        return self.x[i], self.y[i]
    
    def __len__(self):
        return len(self.y)
    
    def make_w2i(self, tok_rows):
        return {k: i for i, (k, v) in 
                enumerate(Counter(chain(*tok_rows)).most_common(), 1)}
    
    def encode(self, tok_rows):
        enc = np.zeros((len(tok_rows), self.seq_len), dtype=int)
        for i, row in enumerate(tok_rows):
            trunc = [self.w2i.get(w, 0) for w in row[:self.seq_len]]
            enc[i, :len(trunc)] = trunc
        return torch.tensor(enc)
sents = [
    'I walked to the store so I hope it is not closed.',
    'The theater is closed today and the sky is grey.',
    'His dog is brown while hers is grey.'
]
labels = [0, 1, 1]
ds = Data(sents, labels, 10)
ds[1]
(tensor([13, 14,  1, 15, 16, 17,  3, 18,  1,  4]), tensor(1))
dl = DataLoader(ds, batch_size=3)
x, y = next(iter(dl))
x, y
(tensor([[ 2,  5,  6,  3,  7,  8,  2,  9, 10,  1],
         [13, 14,  1, 15, 16, 17,  3, 18,  1,  4],
         [19, 20,  1, 21, 22, 23,  1,  4,  0,  0]]),
 tensor([0, 1, 1]))
be = BloomEmbedding(11, 4)
be.emb.weight
Parameter containing:
tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-1.3866e-02,  2.7083e-03,  1.9108e-03,  2.3947e-03],
        [ 5.0557e-03, -4.8774e-03,  1.5206e-03,  1.0080e-03],
        [ 4.5406e-05,  2.1932e-03, -9.5886e-03,  4.7936e-03],
        [-9.7800e-03,  1.9145e-02,  2.4422e-03,  1.2713e-02],
        [ 1.2850e-02,  4.0345e-03, -1.9255e-02, -2.5600e-03],
        [ 5.7765e-03,  1.4253e-02,  1.8160e-02, -1.6686e-02],
        [ 1.2343e-02,  2.8021e-03,  9.5432e-04,  4.2866e-03],
        [ 6.4279e-03, -3.4935e-03, -1.9902e-03, -9.9574e-03],
        [ 9.7122e-04, -6.8190e-03, -1.2612e-02, -1.6921e-03],
        [-1.1588e-02, -4.2316e-03, -9.5648e-03, -6.5988e-03]],
       requires_grad=True)
x
tensor([[ 2,  5,  6,  3,  7,  8,  2,  9, 10,  1],
        [13, 14,  1, 15, 16, 17,  3, 18,  1,  4],
        [19, 20,  1, 21, 22, 23,  1,  4,  0,  0]])
# (bs, seq_len) -> (bs, seq_len, emb_dim)
y = be(x)
y.shape
torch.Size([3, 10, 4])
y[0]
tensor([[ 0.0363, -0.0098,  0.0052,  0.0023],
        [ 0.0337, -0.0130, -0.0147, -0.0183],
        [ 0.0075, -0.0316,  0.0120,  0.0236],
        [ 0.0304, -0.0328,  0.0388, -0.0240],
        [ 0.0144,  0.0004, -0.0022,  0.0019],
        [ 0.0084,  0.0117,  0.0089, -0.0284],
        [ 0.0363, -0.0098,  0.0052,  0.0023],
        [ 0.0177, -0.0087,  0.0344, -0.0139],
        [ 0.0272, -0.0108, -0.0036,  0.0130],
        [ 0.0035, -0.0162,  0.0048,  0.0260]], grad_fn=<SelectBackward>)

Below, we show step by step how to get from x to y. This is meant to demonstrate the basic mechanism, not to show how PyTorch actually implements it under the hood. Let's look at a single row of x, corresponding to one sentence where each word is mapped to its index in the vocabulary.

x[0]
tensor([ 2,  5,  6,  3,  7,  8,  2,  9, 10,  1])

Next, we hash each item.

hashed = [probabilistic_hash_item(i.item(), 11, int, 4) for i in x[0]]
hashed
[[8, 2, 7, 8],
 [2, 8, 1, 2],
 [6, 6, 10, 10],
 [10, 5, 5, 5],
 [6, 9, 7, 2],
 [5, 9, 4, 0],
 [8, 2, 7, 8],
 [5, 10, 8, 9],
 [7, 8, 6, 2],
 [6, 10, 6, 0]]

Then use each row of hashed integers to index into the embedding weight matrix.

output = []
for row in hashed:
    row_out = be.emb.weight[row]
    output.append(row_out)
output = torch.stack(output)
print(output.shape)
output[:2]
torch.Size([10, 4, 4])
tensor([[[ 0.0089,  0.0007,  0.0076,  0.0034],
         [ 0.0097,  0.0003, -0.0098, -0.0082],
         [ 0.0089, -0.0114, -0.0001,  0.0037],
         [ 0.0089,  0.0007,  0.0076,  0.0034]],

        [[ 0.0097,  0.0003, -0.0098, -0.0082],
         [ 0.0089,  0.0007,  0.0076,  0.0034],
         [ 0.0054, -0.0142, -0.0027, -0.0052],
         [ 0.0097,  0.0003, -0.0098, -0.0082]]], grad_fn=<SliceBackward>)

Finally, we sum up the embedding rows. Above, each word is represented by four rows of the embedding matrix. After summing, we get a single vector for each word.

output = output.sum(-2)
output
tensor([[ 0.0363, -0.0098,  0.0052,  0.0023],
        [ 0.0337, -0.0130, -0.0147, -0.0183],
        [ 0.0075, -0.0316,  0.0120,  0.0236],
        [ 0.0304, -0.0328,  0.0388, -0.0240],
        [ 0.0144,  0.0004, -0.0022,  0.0019],
        [ 0.0084,  0.0117,  0.0089, -0.0284],
        [ 0.0363, -0.0098,  0.0052,  0.0023],
        [ 0.0177, -0.0087,  0.0344, -0.0139],
        [ 0.0272, -0.0108, -0.0036,  0.0130],
        [ 0.0035, -0.0162,  0.0048,  0.0260]], grad_fn=<SumBackward1>)

Notice that the values now match the output of our embedding layer.

assert torch.isclose(output, y[0]).all()

Axial encodings are intended to work as positional embeddings for transformer-like architectures. They could conceivably serve as word embeddings too, similar to our use of Bloom embeddings. However, the standard version of axial encodings produces similar vectors for adjacent indices. That makes some sense for positional indices, but for word indices it would likely require additional preprocessing: for example, we could compress word embeddings down to one dimension and sort them, or simply sort words by frequency in our corpus, which arguably amounts to the same thing. Large chunks of the output vectors will be shared among different inputs, whereas Bloom embeddings seem to have a greater capacity to avoid this issue.

class AxialEncoding[source]

AxialEncoding(vocab_dim, emb_dim, pad_idx=None) :: Module

Axial encodings. These are intended to encode position in a sequence
(e.g. index in a sentence). It's possible we could adapt these for use as
word embeddings but this would likely require some experimentation (for
example, words would likely need to be sorted in a thoughtful manner
(e.g. pre-trained embeddings compressed to 1D?) since adjacent inputs will
share half of their encodings).

class MultiAxialEncoding[source]

MultiAxialEncoding(vocab_dim, emb_dim, n_blocks=2, pre_hashed=False, pad_idx=None) :: Module

Adapted axial encodings to allow for more than 2 embedding matrices.
These are intended to encode position in a sequence (e.g. index in a
sentence) but might work as word embeddings. This version may be better
suited for that use case because using more blocks results in fewer shared
numbers in the output vectors of adjacent inputs.

Some experimentation is still required for this use case (for
example, words would likely need to be sorted in a thoughtful manner
(e.g. pre-trained embeddings compressed to 1D?) since adjacent inputs will
share half of their encodings).

I made this separate from AxialEncoding (at least for now) since I made a
few tweaks to the original design to make this possible and I wanted to
preserve the option to use the simpler, well-tested method
(AxialEncoding). Here, we use a probabilistic hashing scheme to map each
input to multiple embedding rows, while the original design uses
x%v and x//v.
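For concreteness, the original two-table scheme can be sketched as follows, where v is the number of rows per table (the smallest integer with v**2 >= vocab_dim) and emb0/emb1 are hypothetical half-width embedding tables:

import torch

def axial_encode(x, emb0, emb1, v):
    # Split each index into a (row, column) pair on a v x v grid and
    # concatenate the two half-size embeddings. Adjacent indices share
    # their x // v half, hence the similar vectors for neighbors.
    return torch.cat([emb0(x // v), emb1(x % v)], dim=-1)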
def reduction_ratio(ax, vocab_size, emb_dim):
    """For testing purposes. Lets us compare the number of weights in a
    traditional embedding matrix vs. the number of weights in our axial
    encoding.
    """
    normal_n = vocab_size * emb_dim
    ax_n = sum(e.weight.numel() for e in ax.emb)
    print('Normal embedding weights:', normal_n)
    print('Axial encoding weights:', ax_n)
    print('Difference:', normal_n - ax_n)
    print('Ratio:', normal_n / ax_n)
vocab_size = 30_000
emb_dim = 100
bs = 12

ax = AxialEncoding(vocab_size, emb_dim)
x = torch.randint(0, vocab_size, (bs, 2))
print(x.shape)
ax
torch.Size([12, 2])
AxialEncoding(
  (emb): ModuleList(
    (0): InitializedEmbedding(174, 50)
    (1): InitializedEmbedding(174, 50)
  )
)
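Each table needs only 174 rows because 174 is the smallest integer whose square (30,276) covers the 30,000-word vocabulary, and each table holds half of the 100-dimensional encoding.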
res = ax(x)
print(res.shape)
torch.Size([12, 2, 100])
reduction_ratio(ax, vocab_size, emb_dim)
Normal embedding weights: 3000000
Axial encoding weights: 17400
Difference: 2982600
Ratio: 172.41379310344828
vocab_size = 30_000
emb_dim = 100
bs = 12

ax = MultiAxialEncoding(vocab_size, emb_dim, 4)
x = torch.randint(0, vocab_size, (bs, 2))
print(x.shape)
ax
torch.Size([12, 2])
MultiAxialEncoding(
  (emb): ModuleList(
    (0): Embedding(14, 25)
    (1): Embedding(14, 25)
    (2): Embedding(14, 25)
    (3): Embedding(14, 25)
  )
)
res1 = ax(x)
res1.shape
torch.Size([12, 2, 100])
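With 4 blocks, each table needs only 14 rows, since 14 is the smallest integer with 14**4 = 38,416 >= 30,000; each table contributes a 25-dimensional slice of the 100-dimensional output.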
vocab_size = 30_000
emb_dim = 100
bs = 12

ax_pre = MultiAxialEncoding(vocab_size, emb_dim, 4, pre_hashed=True)
ax_pre
MultiAxialEncoding(
  (emb): ModuleList(
    (0): Embedding(14, 25)
    (1): Embedding(14, 25)
    (2): Embedding(14, 25)
    (3): Embedding(14, 25)
  )
)

By setting the weights of our pre-hashed embedding to the weights of our hashing embedding, we can check that the outputs are ultimately the same.

for e, e_pre in zip(ax.emb, ax_pre.emb):
    e_pre.weight.data = e.weight.data
xhash = probabilistic_hash_tensor(x, 14, 4)
res2 = ax_pre(xhash)
res2.shape
torch.Size([12, 2, 100])
(res1 == res2).all()
tensor(True)
reduction_ratio(ax_pre, vocab_size, emb_dim)
Normal embedding weights: 3000000
Axial encoding weights: 1400
Difference: 2998600
Ratio: 2142.8571428571427

I imagine that as we increase n_blocks, there's likely a point where we simply won't have enough weights to encode the amount of information that's present in the data. It would take some experimentation to find where that line is, however.

ax_large = MultiAxialEncoding(vocab_size, emb_dim, 8, pre_hashed=True)
ax_large
MultiAxialEncoding(
  (emb): ModuleList(
    (0): Embedding(4, 12)
    (1): Embedding(4, 12)
    (2): Embedding(4, 12)
    (3): Embedding(4, 12)
    (4): Embedding(4, 12)
    (5): Embedding(4, 12)
    (6): Embedding(4, 12)
    (7): Embedding(4, 12)
  )
)
reduction_ratio(ax_large, vocab_size, emb_dim)
Normal embedding weights: 3000000
Axial encoding weights: 384
Difference: 2999616
Ratio: 7812.5

Model Bases

class SiameseBase[source]

SiameseBase() :: BaseModel

Parent class to implement a Siamese network or triplet network (or any
network that passes n inputs of the same shape through a shared encoder).
It concatenates the items into a single batch so the encoder's forward
method (implemented as self._forward) only needs to be called once.
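The shapes printed below are consistent with a forward pass along these lines (a sketch, not the actual source):

import torch

def siamese_forward(self, *xs):
    # Concatenate the n same-shaped inputs into one big batch so the
    # encoder runs only once, then split the encodings back apart and
    # stack them to get shape (bs, n, encoding_dim).
    n = len(xs)
    out = self._forward(torch.cat(xs, dim=0))
    return torch.stack(out.chunk(n, dim=0), dim=1)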
bs, c, h, w = 4, 3, 8, 8
n = 3
xb = [torch.randn(bs, c, h, w) for _ in range(n)]
smap(*xb)
[torch.Size([4, 3, 8, 8]), torch.Size([4, 3, 8, 8]), torch.Size([4, 3, 8, 8])]
class TripletNet(SiameseBase):
    
    def __init__(self, c_in=3):
        super().__init__()
        self.conv = nn.Conv2d(c_in, 16, kernel_size=3, stride=2)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        
    def _forward(self, xb):
        print(xb.shape)
        xb = self.conv(xb)
        print(xb.shape)
        xb = self.pool(xb)
        print(xb.shape)
        xb = xb.squeeze(-1).squeeze(-1)
        print(xb.shape)
        return xb

In this example, each image is encoded as a 16D vector. We have 3 images per row and 4 rows per batch, so we end up with a tensor of shape (4, 3, 16). Notice we only perform one forward pass: while we could simply define a separate encoder and pass each image through it separately (e.g. [self.encoder(x) for x in xb]), that becomes rather slow when n is large or the encoder is enormous.

tnet = TripletNet()
yh = tnet(*xb)
yh.shape
torch.Size([12, 3, 8, 8])
torch.Size([12, 16, 3, 3])
torch.Size([12, 16, 1, 1])
torch.Size([12, 16])
torch.Size([4, 3, 16])

The name TripletNet is slightly misleading here: the network can actually handle any choice of n. For instance, here we use it as a Siamese network.

yh = tnet(*xb[:2])
yh.shape
torch.Size([8, 3, 8, 8])
torch.Size([8, 16, 3, 3])
torch.Size([8, 16, 1, 1])
torch.Size([8, 16])
torch.Size([4, 2, 16])