%load_ext autoreload
%autoreload 2
%matplotlib inline
# Only needed for testing.
from collections import Counter
from itertools import chain
import numpy as np
from torch.utils.data import DataLoader
from htools import eprint, assert_raises
# Toy corpus: three short sentences that are passed to `Data` below.
sents = [
'I walked to the store so I hope it is not closed.',
'The theater is closed today and the sky is grey.',
'His dog is brown while hers is grey.'
]
# One integer label per sentence, in the same order as `sents`.
labels = [0, 1, 1]
class Data(Dataset):
    """Toy dataset of space-tokenized sentences encoded as word indices.

    Every sentence is split on single spaces, each token is replaced by its
    integer index and the row is truncated or right-padded with zeros so
    that it holds exactly `seq_len` indices.
    """

    def __init__(self, sentences, labels, seq_len):
        """
        Parameters
        ----------
        sentences: list[str]
            Raw sentences. Tokens are obtained with `str.split(' ')`.
        labels: list[int]
            One label per sentence, in the same order as `sentences`.
        seq_len: int
            Number of token indices stored for each sentence.
        """
        x = [s.split(' ') for s in sentences]
        # The vocabulary and seq_len must be set before `encode` is called.
        self.w2i = self.make_w2i(x)
        self.seq_len = seq_len
        self.x = self.encode(x)
        self.y = torch.tensor(labels)

    def __getitem__(self, i):
        """Return the (encoded sentence, label) pair at position i."""
        return self.x[i], self.y[i]

    def __len__(self):
        """Return the number of labelled sentences."""
        return len(self.y)

    def make_w2i(self, tok_rows):
        """Map each distinct token to an integer index.

        Parameters
        ----------
        tok_rows: list[list[str]]
            Tokenized sentences.

        Returns
        -------
        dict[str, int]: Indices start at 1 and follow the order of
        `Counter.most_common()`, so the most frequent token gets index 1.
        Index 0 is left unused because `encode` uses it as the fill value.
        """
        counts = Counter(chain.from_iterable(tok_rows))
        return {word: i
                for i, (word, _) in enumerate(counts.most_common(), 1)}

    def encode(self, tok_rows):
        """Convert tokenized sentences to a 2D tensor of word indices.

        Parameters
        ----------
        tok_rows: list[list[str]]
            Tokenized sentences.

        Returns
        -------
        torch.Tensor: Shape (len(tok_rows), self.seq_len). Rows longer than
        `seq_len` are truncated, shorter rows keep trailing zeros, and
        tokens missing from `self.w2i` are encoded as 0.
        """
        enc = np.zeros((len(tok_rows), self.seq_len), dtype=int)
        for i, row in enumerate(tok_rows):
            trunc = [self.w2i.get(w, 0) for w in row[:self.seq_len]]
            enc[i, :len(trunc)] = trunc
        return torch.tensor(enc)
We construct a toy dataset with a vocabulary of size 23. In reality, you might wish to lowercase text or use a better tokenizer, but this is sufficient for the purposes of demonstration.
# Build the dataset, storing 10 token indices per sentence.
ds = Data(sents, labels, 10)
# Number of distinct tokens in the vocabulary.
len(ds.w2i)
# batch_size=3 matches the number of sentences, so the first batch holds
# the whole dataset.
dl = DataLoader(ds, batch_size=3)
x, y = next(iter(dl))
x, y
x.shape
We hash each word index 4 times, as specified by the `n_hashes` parameter in `probabilistic_hash_tensor`. Notice that we only use 7 buckets, meaning the embedding matrix will have 7 rows rather than 23 (not counting a padding row).
# Hash every word index in the batch 4 times, using 7 buckets.
x_hashed = probabilistic_hash_tensor(x, n_buckets=7, n_hashes=4)
# Compare the shapes before and after hashing.
print('x shape:', x.shape)
print('x_hashed shape:', x_hashed.shape)
Below, each row of 4 numbers encodes a single word.
# Display the hashed batch.
x_hashed
See how each word is mapped to a list of 4 indices.
# Pair each token of the first sentence with its encoded index, then print
# the token next to the 4 hashes (7 buckets) computed for that index.
first_sentence = zip(sents[0].split(' '), x[0])
for word, i in first_sentence:
    print(word, probabilistic_hash_item(i.item(), 7, int, 4))
Notice that hashing the words directly is also possible, but the resulting hashes will differ from those obtained by hashing the integer-encoded words. This is fine as long as you are consistent.
# Hash the raw string tokens (11 buckets) instead of their integer indices
# and print each sentence as enumerated (token, hashes) pairs.
for row in (s.split(' ') for s in sents):
    hashed = [probabilistic_hash_item(word, 11, str) for word in row]
    eprint(list(zip(row, hashed)))
    # Blank line between sentences.
    print()
Below, we show that we can obtain unique representations for >99.9% of words in a vocabulary of 30,000 words with a far smaller embedding matrix. The number of buckets is the number of rows in the embedding matrix.
def unique_combos(tups):
    """Count the distinct combinations in an iterable of sequences.

    Each item is sorted before comparison, so two items holding the same
    values in a different order count as a single combination.

    Parameters
    ----------
    tups: Iterable[Iterable]
        Sequences of sortable, hashable values.

    Returns
    -------
    int: Number of order-insensitive unique combinations.
    """
    seen = set()
    for combo in tups:
        seen.add(tuple(sorted(combo)))
    return len(seen)
def hash_all_idx(vocab_size, n_buckets, n_hashes):
    """Hash every integer index in a vocabulary.

    Parameters
    ----------
    vocab_size: int
        Indices 0 through vocab_size - 1 are hashed.
    n_buckets: int
        Forwarded to `probabilistic_hash_item`.
    n_hashes: int
        Forwarded to `probabilistic_hash_item`.

    Returns
    -------
    list: One `probabilistic_hash_item` result per index, in index order.
    """
    hashed = []
    for idx in range(vocab_size):
        hashed.append(probabilistic_hash_item(idx, n_buckets, int, n_hashes))
    return hashed
vocab_size = 30_000
# Maps number of buckets -> number of hashes computed per word index.
buckets2hashes = {127: 5,
                  251: 4,
                  997: 3,
                  5_003: 2}
for b, h in buckets2hashes.items():
    tups = hash_all_idx(vocab_size, b, h)
    unique = unique_combos(tups)
    # Divide by vocab_size rather than a repeated literal so the reported
    # fraction stays correct if vocab_size is changed above.
    print('\n\nBuckets:', b, '\nHashes:', h, '\nUnique combos:', unique,
          '\n% unique:', round(unique/vocab_size, 4))
@auto_repr
class LazyDataset(Dataset):
    """Lazily load batches from an enormous dataframe that can't fit into
    memory.

    Rows are read from disk one chunk at a time and served sequentially.
    The index passed to `__getitem__` is ignored, so any shuffling happens
    here (within a chunk) rather than in the dataloader.
    """

    def __init__(self, df_path, length, shuffle, chunksize=1_000,
                 c=2, classes=('neg', 'pos'), **kwargs):
        """
        Parameters
        ----------
        df_path: str
            File path of dataframe to load.
        length: int
            Number of rows of data to use. This is required so that we don't
            have to go through the whole file and count the number of lines,
            which can be enormous with a big dataset. It also makes it easy to
            work with a subset (the data should already be shuffled, so
            choosing the top n rows is fine).
        shuffle: bool
            If True, shuffle the data in each chunk. Note that if batch size
            is close to chunk size, this will have minimal effect. If possible,
            the training set should therefore load as large a chunk as
            possible if we want to shuffle the data. Shuffling is unnecessary
            for the validation set.
        chunksize: int
            Number of rows of df to load at a time. This should usually
            be significantly larger than the batch size in order to retain
            some randomness in the batches.
        c: int
            Number of classes. Used if training with FastAI.
        classes: iterable
            List or tuple of class names. Used if training with FastAI.
        kwargs: any
            Additional keyword arguments to pass to `read_csv`, eg.
            compression='gzip'.
        """
        if length < chunksize:
            warnings.warn('Total # of rows < 1 full chunk. LazyDataset may '
                          'not be necessary.')

        self.length = length
        self.shuffle = shuffle
        self.chunksize = chunksize
        self.df_path = df_path
        # Chunked csv reader. Created lazily on the first `__getitem__` call.
        self.df = None
        # Chunk currently held in memory.
        self.chunk = None
        # Row labels of `self.chunk` that have not been served yet.
        self.chunk_idx = None
        self.df_kwargs = kwargs

        # Additional attributes required by FastAI.
        # c: Number of classes in model.
        self.c = c
        self.classes = list(classes)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        """Because not all indices are loaded at once, we must do shuffling
        in the dataset rather than the dataloader (e.g. if the loader randomly
        samples index 5000 but we have indices 0-500 loaded, it will be
        unavailable).

        Parameters
        ----------
        idx: int
            Accepted for compatibility with the Dataset interface but not
            used: the next unserved row of the current chunk is returned.

        Returns
        -------
        tuple[np.array]: x array, y array. The last column of the row is
        treated as y and every preceding column as x.
        """
        # Load the next chunk on the first call or once every row of the
        # current chunk has been served (None and an empty deque are falsy).
        if not self.chunk_idx:
            self.chunk = self._next_chunk()
            # Work on a copy so that shuffling in place never touches the
            # array backing the chunk's own index.
            chunk_idx = self.chunk.index.values.copy()
            if self.shuffle:
                np.random.shuffle(chunk_idx)
            self.chunk_idx = deque(chunk_idx)

        *x, y = self.chunk.loc[self.chunk_idx.popleft()].values
        return np.array(x), y.astype(float)

    def _next_chunk(self):
        """Return the next chunk of rows, (re)opening the file when needed.

        Raises
        ------
        RuntimeError: If a freshly opened reader cannot produce a chunk.
            Without this check the retry loop would never terminate.
        """
        reopened = False
        while True:
            try:
                return self.df.get_chunk()
            except (AttributeError, StopIteration) as err:
                # AttributeError: no reader exists yet (self.df is None).
                # StopIteration: the reader is exhausted, so start a new
                # pass over the file.
                if reopened:
                    raise RuntimeError(
                        f'Could not read any rows from {self.df_path!r}.'
                    ) from err
                # Must specify nrows, otherwise we will chunk through the
                # whole file.
                self.df = pd.read_csv(self.df_path, engine='python',
                                      chunksize=self.chunksize,
                                      nrows=len(self),
                                      **self.df_kwargs)
                reopened = True
class ImageMixer:
    """The transformation that powers MixupDS.

    Inspired by the "Visual-Spatial - Entangled Figures" task here:
    http://www.happy-neuron.com/brain-games/visual-spatial/entangled-figures

    The key idea: when a human plays the happy neuron task, it is much easier
    when viewing an entanglement of objects they recognize (e.g. bicycle, leaf,
    etc.) than of random scribbles. I noticed that for the harder levels, my
    strategy was to try to quickly identify a couple distinctive features, then
    search for them in the individual images once they appeared. This quick
    feature extraction seems very close to what we want to achieve during the
    pre-training step.
    """

    def __init__(self, n=3, a=5, b=8, dist=None, **kwargs):
        """
        Parameters
        ----------
        n: int
            Number of images to use as inputs. With the current implementation,
            the constructed image will use exactly 2 of these. The rest will
            be negatives (zero weight).
        a: int
            Parameter in beta distribution.
        b: int
            Parameter in beta distribution.
        dist: torch.distribution
            This can be anything with a "sample()" method that generates a
            random value between 0 and 1. By default, we use a Beta
            distribution. If one is passed in, a and b are ignored.
        kwargs: any
            Makes it easier to use in `get_databunch` function. Extra kwargs
            are ignored.
        """
        assert n >= 2, 'n must be >=2 so we can combine images.'
        # Explicit None check: a user-supplied sampler must never be
        # discarded just because it happens to evaluate as falsy.
        if dist is None:
            dist = torch.distributions.beta.Beta(a, b)
        self.dist = dist
        self.n = n

    def transform(self, *images):
        """Create linear combination of images.

        Parameters
        ----------
        images: torch.tensors
            Exactly `self.n` tensors of identical shape
            (channels, height, width).

        Returns
        -------
        tuple: The first item is itself a tuple whose first element is the
        newly constructed image, followed by the unchanged input images.
        The second item is the rank 1 tensor of weights used to generate the
        combination. These will serve as labels in our self-supervised task.

        Raises
        ------
        ValueError: If the number of images differs from `self.n`. A single
            image would otherwise broadcast against the weights and come
            back unchanged without any error.
        """
        if len(images) != self.n:
            raise ValueError(
                f'Expected {self.n} images but received {len(images)}.'
            )
        w = self._generate_weights()
        return (self._combine_images(images, w), *images), w

    def _generate_weights(self):
        """
        Returns
        -------
        weights: torch.Tensor
            Vector of length self.n. Exactly 2 of these values are
            nonzero and they sum to 1. This will be used to compute a linear
            combination of a row of images.
        """
        weights = np.zeros(self.n)
        # float() accepts a Python number, a numpy scalar or a one-element
        # tensor, so any sampler matching the documented contract works.
        p = float(self.dist.sample())
        # Pick the 2 distinct positions that receive nonzero weight.
        indices = np.random.choice(self.n, size=2, replace=False)
        weights[indices] = p, 1 - p
        return torch.tensor(weights, dtype=torch.float)

    def _combine_images(self, images, weights):
        """Create linear combination of multiple images.

        Parameters
        ----------
        images: Sequence[torch.Tensor]
            Images of identical shape (channels, height, width).
        weights: torch.Tensor
            Vector with 1 value for each image. Exactly 2 of these values are
            nonzero and they sum to 1. I.e. if we have 3 images a, b, and c,
            weights would look something like [0, .3, .7].

        Returns
        -------
        torch.tensor: Shape (channels, height, width), same as each of the
        input images.
        """
        images = torch.stack(images, dim=0)
        # 3 new dimensions correspond to (c, h, w), NOT self.n.
        return (weights[:, None, None, None] * images).sum(0).float()
Below, we define a few toy functions to demonstrate how we can alter an input string. In practice, we'd use more useful transformations like the ones defined in `incendio.nlp`, e.g. `ParaphraseTransform`.
def to_upper(t):
    """Return `t` with its `upper()` method applied."""
    uppercased = t.upper()
    return uppercased
def times_3(t):
    """Return `t * 3` (for a string, the text repeated three times)."""
    tripled = t * 3
    return tripled
def join(t, sep='---'):
    """Join the items of `t` with `sep` between them.

    When `t` is a string, its individual characters are the items.
    """
    joined = sep.join(t)
    return joined
text = 'dog'

# Pipeline built from the three toy transforms, with no `p` argument.
pipeline = RandomPipeline(to_upper, times_3, join)
pipeline
for i in range(5):
    print(pipeline(text))

# Same transforms, this time with one `p` value per transform.
pipeline = RandomPipeline(to_upper, times_3, join, p=[1.0, 0.4, 1])
for i in range(5):
    print(pipeline(text))

# p=0 is expected to be rejected with a ValueError.
with assert_raises(ValueError):
    pipeline = RandomPipeline(join, p=0)

# 2 `p` values for 3 transforms is expected to fail an assertion.
with assert_raises(AssertionError):
    pipeline = RandomPipeline(to_upper, times_3, join, p=[0.2, 1])

# Alternate constructor: a dict mapping each transform to its `p` value.
transforms = {
    times_3: 0.33,
    join: 0.67,
    to_upper: 0.95,
}
pipeline = RandomPipeline.from_dict(transforms)
for i in range(5):
    print(pipeline(text))
# Convert the same local path under every combination of S3 prefix
# (omitted vs. 'hmamin') and retain_tree (default vs. False).
# Variable names: first letter = prefix given (t/f), second letter =
# file tree retained (t/f), matching the labels printed below.
up = BotoUploader('gg-datascience')
ft = up._convert_local_path('data/v1/history.csv')
tt = up._convert_local_path('data/v1/history.csv', 'hmamin')
tf = up._convert_local_path('data/v1/history.csv', 'hmamin', retain_tree=False)
ff = up._convert_local_path('data/v1/history.csv', retain_tree=False)
# Print each converted path under a label describing its settings.
print('No S3 prefix, Yes retain file tree:\n' + ft)
print('\nYes S3 prefix, Yes retain file tree:\n' + tt)
print('\nYes S3 prefix, No retain file tree:\n' + tf)
print('\nNo S3 prefix, No retain file tree:\n' + ff)
Below, we demonstrate plotting a list of images stored as numpy arrays in a grid.
# 16 random numpy arrays of shape (3, 16, 16), clipped to [0, 1]. Dividing
# by i (1 through 16) scales later images toward smaller values.
images = [np.clip(5 * np.random.uniform(size=(3, 16, 16)) / i, 0, 1)
for i in range(1, 17)]
# One title per image, numbered 0 through 15.
plot_images(images, titles=[f'Image {i}' for i in range(16)])
Here is a similar example showcasing the title color functionality. Notice we can use tensors here too.
# Same construction as a list of torch tensors: 16 random (3, 16, 16)
# tensors clamped to [0, 1].
images = [torch.clamp(5 * torch.rand(3, 16, 16) / i, 0, 1)
for i in range(1, 17)]
# Title colors alternate: red for even positions, green for odd ones.
plot_images(images, titles=[f'Image {i}' for i in range(16)],
title_colors=['red' if i%2 == 0 else 'green' for i in range(16)])