from collections import Counter
import gc
import numpy as np
import spacy
import torch

from htools import hdir
a = Counter()
tmp1 = Counter('ab c')
tmp2 = Counter('a, ccc bb d ee')
print(tmp1)
print(tmp2)
print({k for k in set(list(tmp1.keys()) + list(tmp2.keys()))})
Counter({'a': 1, 'b': 1, ' ': 1, 'c': 1})
Counter({' ': 4, 'c': 3, 'b': 2, 'e': 2, 'a': 1, ',': 1, 'd': 1})
{'b', 'd', ' ', 'c', 'a', 'e', ','}
{k: tmp1.get(k, 0) + tmp2.get(k, 0) for k in {*tmp1.keys(), *tmp2.keys()}}
{'b': 3, 'd': 1, ' ': 5, 'c': 4, 'a': 2, 'e': 2, ',': 1}
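Counter also supports merging directly with the + operator; adding the two counters gives the same totals as the dict comprehension above (a quick equivalence check, nothing new assumed):
tmp1 + tmp2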
!ls
scratch_add_dicts.ipynb              scratch_train.ipynb
scratch_inheritance_and_mixins.ipynb scratch_vocabulary.ipynb
class Vocabulary:
    
    def __init__(self, w2idx, w2vec=None, idx_misc=None, corpus_counts=None,
                 all_lower=True):
        """Defines a vocabulary object for NLP problems, allowing users to 
        encode text with indices or embeddings.
        
        Parameters
        -----------
        w2idx: dict[str, int]
            Dictionary mapping words to their integer index in a vocabulary. 
            The indices must allow for idx_misc to be added to the dictionary,
            so in the default case this should have a minimum index of 2. If
            a longer idx_misc is passed in, the minimum index would be larger.
        w2vec: dict[str, np.array]
            Dictionary mapping words to their embedding vectors stored as
            numpy arrays (optional).
        idx_misc: dict
            A dictionary mapping non-word tokens to indices. If none is passed
            in, a default version will be used with keys for unknown tokens
            and padding. A customized version might pass in additional tokens
            for repeated characters or all caps, for example.
        corpus_counts: collections.Counter
            Counter dict mapping words to their number of occurrences in a 
            corpus (optional).
        all_lower: bool
            Specifies whether the data you've passed in (w2idx, w2vec) is all
            lowercase. Note that this will NOT change any of that data. If
            True, it simply lowercases user-input words when looking up their
            index or vector.
        """
        if not idx_misc:
            idx_misc = {'<PAD>': 0,
                        '<UNK>': 1}
        self.idx_misc = idx_misc
        # Check that space has been left for misc keys.
        assert len(idx_misc) == min(w2idx.values())
        
        # Core data structures.
        self.w2idx = {**self.idx_misc, **w2idx}    
        self.i2w = [word for word, idx in sorted(self.w2idx.items(), 
                                                 key=lambda x: x[1])]
        self.w2vec = w2vec or dict()
        
        # Miscellaneous other attributes.
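        # Infer the embedding dimension from the last word's vector; fall back
        # to a dummy dimension of 1 when no vectors were passed in.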
        if w2vec:
            self.dim = len(w2vec[self[-1]])
        else:
            self.dim = 1
        self.corpus_counts = corpus_counts
        self.embedding_matrix = None
        self.w2vec['<UNK>'] = np.zeros(self.dim)
        self.all_lower = all_lower
    
    @classmethod
    def from_glove_file(cls, path, max_lines=float('inf'), idx_misc=None):
        """Create a new Vocabulary object by loading GloVe vectors from a text
        file. The embeddings are all lowercase so the user does not have the
        option to set the all_lower parameter.
        
        Parameters
        -----------
        path: str
            Path to a file containing GloVe vectors.
        max_lines: int, float (optional)
            Loading the GloVe vectors can be slow, so for testing purposes
            it can be helpful to read in a subset. If no value is provided,
            all 400,000 lines in the file will be read in.
        idx_misc: dict
            Map non-standard tokens to indices. See constructor docstring.
        """
        w2idx = dict()
        w2vec = dict()
        misc_len = 2 if not idx_misc else len(idx_misc)
        
        with open(path, 'r') as f:
            for i, line in enumerate(f):
                if i >= max_lines:
                    break
                word, *values = line.strip().split(' ')
                w2idx[word] = i + misc_len
                w2vec[word] = np.array(values, dtype=float)
           
        return cls(w2idx, w2vec, idx_misc)
    
    @classmethod
    def from_tokens(cls, tokens, idx_misc=None, all_lower=True):
        """Construct a Vocabulary object from a list or array of tokens.
        
        Parameters
        -----------
        tokens: list[str]
            The word-tokenized corpus.
        idx_misc: dict
            Map non-standard tokens to indices. See constructor docstring.
        all_lower: bool
            Specifies whether your tokens are all lowercase.
            
        Returns
        --------
        Vocabulary
        """
        misc_len = 2 if not idx_misc else len(idx_misc)
        counts = Counter(tokens)
        w2idx = {word: i for i, (word, freq) 
                 in enumerate(counts.most_common(), misc_len)}
        return cls(w2idx, idx_misc=idx_misc, corpus_counts=counts, 
                   all_lower=all_lower)
    
    @staticmethod
    def from_pickle(path):
        """Load a previously saved Vocabulary object.
        
        Parameters
        -----------
        path: str
            Location of pickled Vocabulary file.
            
        Returns
        --------
        Vocabulary
        """
        return torch.load(path)
    
    def save(self, path, verbose=True):
        """Pickle Vocabulary object for later use. We can then quickly load 
        the object using torch.load(path), which can be much faster than
        re-computing everything when the vocab size becomes large.
        
        Parameters
        -----------
        path: str
            Where to save the output file.
        verbose: bool
            If True, print message showing where the object was saved to.
        """
        if verbose:
            print(f'Saving vocabulary to {path}.')
        torch.save(self, path)
    
    def filter_tokens(self, tokens, max_words=None, min_freq=0, inplace=False, 
                     recompute=False):
        """Filter your vocabulary by specifying a max number of words or a min
        frequency in the corpus. When done in place, this also sorts vocab by
        frequency with more common words coming first (after idx_misc).
        
        Parameters
        -----------
        tokens: list[str]
            A tokenized list of words in the corpus (must be all lowercase 
            when self.all_lower=True, such as when using GloVe vectors). There 
            is no need to hold out test data here since we are not using 
            labels.
        max_words: int (optional)
            Provides an upper threshold for the number of words in the
            vocabulary. If no value is passed in, no maximum limit will be
            enforced.
        min_freq: int (optional)
            Provides a lower threshold for the number of times a word must 
            appear in the corpus to remain in the vocabulary. If no value is
            passed in, no minimum limit will be enforced.
            
            Note that we can specify values for both max_words and min_freq
            if desired. If no values are passed in for either, no pruning of
            the vocabulary will be performed.
        inplace: bool
            If True, will change the object's attributes 
            (w2idx, w2vec, and i2w) to reflect the newly filtered vocabulary.
            If False, will not change the object, but will simply compute word
            counts and return what the new w2idx would be. This can be helpful
            for experimentation, as we may want to try out multiple values of
            min_freq to decide how many words to keep. After the first call,
            the attribute corpus_counts can also be examined to help determine
            the desired vocab size.
        recompute: bool
            If True, will calculate word counts from the given tokens. If 
            False (the default), this will use existing counts if there are 
            any. 
            
            The idea is that if we call this method, then realize we want
            to change the corpus, we should calculate new word counts. 
            However, if we are simply calling this method multiple times on 
            the same corpus while deciding on the exact vocab size we want,
            we should not recompute the word counts.
            
        Returns
        --------
        dict or None: When called inplace, nothing is returned. When not
            inplace, the filtered w2idx dict (including idx_misc) is returned
            and the object itself is left unchanged.
        """
        misc_len = len(self.idx_misc)
        if recompute or not self.corpus_counts:
            self.corpus_counts = Counter(tokens)
        filtered = {word: i for i, (word, freq) 
                    in enumerate(self.corpus_counts.most_common(max_words),
                                 misc_len) 
                    if freq >= min_freq}
        filtered = {**self.idx_misc, **filtered}
        
        if inplace:
            # Relies on python3.7 dicts retaining insertion order.
            self.i2w = list(filtered.keys())
            self.w2idx = filtered
            self.w2vec = {word: self.vector(word) for word in filtered}
        else:
            return filtered
        
    def build_embedding_matrix(self, inplace=False):
        """Create a 2D numpy array of embedding vectors where row[i] 
        corresponds to word i in the vocabulary. This can be used to 
        initialize weights in the model's embedding layer.
        
        Parameters
        -----------
        inplace: bool
            If True, will store the output in the object's embedding_matrix
            attribute. If False (default behavior), will simply return the 
            matrix without storing it as part of the object. In the 
            recommended case where inplace==False, we can store the output
            in another variable which we can use to initialize the weights in
            Torch, then delete the object and free up memory using 
            gc.collect().
        """
        emb = np.zeros((len(self), self.dim))
        for i, word in enumerate(self):
            emb[i] = self.vector(word)
            
        if inplace:
            self.embedding_matrix = emb
        else:
            return emb
    
    def idx(self, word):
        """This will map a word (str) to its index (int) in the vocabulary. 
        If a string is passed in and the word is not present, the index
        corresponding to the <UNK> token is returned.
        
        Parameters
        -----------
        word: str
            A word that needs to be mapped to an integer index.
            
        Returns
        --------
        int: The index of the given word in the vocabulary.
            
        Examples
        ---------
        >>> vocab.idx('the')
        2
        """
        if self.all_lower and word not in self.idx_misc:
            word = word.lower()
        return self.w2idx.get(word, self.w2idx['<UNK>'])
    
    def vector(self, word):
        """This maps a word to its corresponding embedding vector. If not
        contained in the vocab, a vector of zeros will be returned.
        
        Parameters
        -----------
        word: str
            A word that needs to be mapped to a vector.
            
        Returns
        --------
        np.array
        """
        if self.all_lower and word not in self.idx_misc:
            word = word.lower()
        return self.w2vec.get(word, self.w2vec['<UNK>'])
        
    def encode(self, text, nlp, max_len, pad_end=True, trim_start=True):
        """Encode text so that each token is replaced by its integer index in 
        the vocab.
        
        Parameters
        -----------
        text: str
            Raw text to be encoded.
        nlp: spacy.lang.en.English
            Spacy tokenizer. Typically want to disable 'parser', 'tagger', and
            'ner' as they aren't used here and slow down the encoding process.
        max_len: int
            Length of output encoding. If text is shorter, it will be padded 
            to fit the specified length. If text is longer, it will be 
            trimmed.
        pad_end: bool
            If True, add padding to the end of short sentences. If False, pad
            the start of these sentences.
        trim_start: bool
            If True, trim off the start of sentences that are too long. If 
            False, trim off the end.
            
        Returns
        --------
        np.array[int]: Array of length max_len containing integer indices
            corresponding to the words passed in.
        """
        output = np.ones(max_len) * self.idx('<PAD>')
        encoded = [self.idx(tok.text) for tok in nlp(text)]
        
        # Trim sentence in case it's longer than max_len.
        if len(encoded) > max_len:
            if trim_start:
                encoded = encoded[len(encoded) - max_len:]
            else:
                encoded = encoded[:max_len]

        # Overwrite padding at the start or end, depending on pad_end.
        if pad_end:
            output[:len(encoded)] = encoded
        else:
            output[max_len-len(encoded):] = encoded
        return output.astype(int)
    
    def decode(self, idx):
        """Convert a list of indices to a string of words/tokens.
        
        Parameters
        -----------
        idx: list[int]
            A list of integers indexing into the vocabulary. This will often 
            be the output of the encode() method.
            
        Returns
        --------
        list[str]: A list of words/tokens reconstructed by indexing into the 
            vocabulary.
        """
        return [self[i] for i in idx]
    
    def __getitem__(self, i):
        """This will map an index (int) to a word (str).
        
        Parameters
        -----------
        i: int
            Integer index for a word.
            
        Returns
        --------
        str: Word corresponding to the given index.
            
        Examples
        ---------
        >>> vocab = Vocabulary(w2idx, w2vec)
        >>> vocab[1]
        '<UNK>'
        """
        return self.i2w[i]
    
    def __len__(self):
        """Number of words in vocabulary."""
        return len(self.w2idx)
    
    def __iter__(self):
        yield from self.w2idx

    def __contains__(self, word):
        return word in self.w2idx
    
    def __eq__(self, obj):
        if not isinstance(obj, Vocabulary):
            return False
        
        ignore = {'w2vec', 'embedding_matrix'}
        attrs = [k for k, v in hdir(self).items()
                 if v == 'attribute' and k not in ignore]
        return all(getattr(self, attr) == getattr(obj, attr)
                   for attr in attrs)
    
    def __repr__(self):
        msg = f'Vocabulary({len(self)} words'
        if self.dim > 1:
            msg += f', {self.dim}-D embeddings'
        return msg + ')'
nlp = spacy.load('en_core_web_sm')
glove_path = '/Users/hmamin/data/glove/glove.6B.100d.txt'

vocab = Vocabulary.from_glove_file(glove_path, 5_000)
vocab
Vocabulary(5002 words, 100-D embeddings)
len(vocab)
5002
vocab.save('tmp.pkl')
Saving vocabulary to tmp.pkl.
v2 = Vocabulary.from_pickle('tmp.pkl')
vocab == v2
True
assert v2.w2idx == vocab.w2idx
assert v2.i2w == vocab.i2w
assert v2.corpus_counts == vocab.corpus_counts
assert v2.idx_misc == vocab.idx_misc
assert v2.dim == vocab.dim
assert len(v2.w2vec) == len(vocab.w2vec)
del v2
gc.collect()
85
vocab[:5]
['<PAD>', '<UNK>', 'the', ',', '.']
for i in range(5):
    word = vocab[i]
    print(i, word, vocab.idx(word), vocab.vector(word)[:3])
0 <PAD> 1 [0. 0. 0.]
1 <UNK> 1 [0. 0. 0.]
2 the 2 [-0.038194 -0.24487   0.72812 ]
3 , 3 [-0.10767  0.11053  0.59812]
4 . 4 [-0.33979  0.20941  0.46348]
emb = vocab.build_embedding_matrix()
print(vocab.embedding_matrix)
emb[:10, :5]
None
array([[ 0.      ,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      ,  0.      ],
       [-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172],
       [-0.10767 ,  0.11053 ,  0.59812 , -0.54361 ,  0.67396 ],
       [-0.33979 ,  0.20941 ,  0.46348 , -0.64792 , -0.38377 ],
       [-0.1529  , -0.24279 ,  0.89837 ,  0.16996 ,  0.53516 ],
       [-0.1897  ,  0.050024,  0.19084 , -0.049184, -0.089737],
       [-0.071953,  0.23127 ,  0.023731, -0.50638 ,  0.33923 ],
       [ 0.085703, -0.22201 ,  0.16569 ,  0.13373 ,  0.38239 ],
       [-0.27086 ,  0.044006, -0.02026 , -0.17395 ,  0.6444  ]])
del emb
gc.collect()
1525
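As a rough sketch of the workflow described in build_embedding_matrix's docstring (the embedding layer below is hypothetical, not part of this notebook's model), the matrix could seed a torch nn.Embedding before the numpy copy is freed:

# Hypothetical sketch: seed an nn.Embedding from the matrix, then free the
# numpy copy. Sizes simply mirror the vocab loaded above.
emb = vocab.build_embedding_matrix()
emb_layer = torch.nn.Embedding(len(vocab), vocab.dim,
                               padding_idx=vocab.idx('<PAD>'))
emb_layer.weight.data.copy_(torch.from_numpy(emb))
del emb
gc.collect()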
# vocab.build_embedding_matrix(True)
# vocab.embedding_matrix.shape
raw = """I went to the beach today. It was sunny and warm. The water was
very blue and cold. The relaxing atmosphere provided a welcome break from the
daily routine. beach sunny sunny sunny sunny atmosphere atmosphere atmosphere
blue blue blue blue blue. beach next beach atmosphere"""
corpus = [t.text for t in nlp(raw.lower(), disable=['parser', 'tagger', 'ner'])]
print(len(corpus))

filtered = vocab.filter_tokens(corpus, max_words=15)
print(len(filtered))
filtered
54
17
{'<PAD>': 0,
 '<UNK>': 1,
 'blue': 2,
 '.': 3,
 'sunny': 4,
 'atmosphere': 5,
 'the': 6,
 'beach': 7,
 '\n': 8,
 'was': 9,
 'and': 10,
 'i': 11,
 'went': 12,
 'to': 13,
 'today': 14,
 'it': 15,
 'warm': 16}
vocab.corpus_counts
Counter({'i': 1,
         'went': 1,
         'to': 1,
         'the': 4,
         'beach': 4,
         'today': 1,
         '.': 5,
         'it': 1,
         'was': 2,
         'sunny': 5,
         'and': 2,
         'warm': 1,
         'water': 1,
         '\n': 3,
         'very': 1,
         'blue': 6,
         'cold': 1,
         'relaxing': 1,
         'atmosphere': 5,
         'provided': 1,
         'a': 1,
         'welcome': 1,
         'break': 1,
         'from': 1,
         'daily': 1,
         'routine': 1,
         'next': 1})
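Since inplace=False just returns a candidate w2idx, a quick sweep over min_freq (reusing the counts above) shows how large the filtered vocab would be at each threshold:
for freq in (1, 2, 3, 5):
    print(freq, len(vocab.filter_tokens(corpus, min_freq=freq)))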
len(vocab)
5001
vocab.filter_tokens(corpus, min_freq=2, inplace=True)
len(vocab)
11
vocab.w2idx
{'<PAD>': 0,
 '<UNK>': 1,
 'blue': 2,
 '.': 3,
 'sunny': 4,
 'atmosphere': 5,
 'the': 6,
 'beach': 7,
 '\n': 8,
 'was': 9,
 'and': 10}
vocab.i2w
['<PAD>',
 '<UNK>',
 'blue',
 '.',
 'sunny',
 'atmosphere',
 'the',
 'beach',
 '\n',
 'was',
 'and']
vocab.idx('BLUE')
2
vocab.all_lower = False
vocab.idx('BLUE')
1
vocab.all_lower = True
(vocab.vector('blue') == vocab.vector('BLUE')).all()
True
# vocab.save('../data/vocab.pkl')
# vocab2 = torch.load('../data/vocab.pkl')
# print(vocab.w2idx == vocab2.w2idx)
# del vocab2
# gc.collect()

Custom idx_misc

custom_misc = {'<UNK>': 0, '<PAD>': 1, '<XXrep>': 2, '<XXcaps>': 3}
vocab = Vocabulary.from_glove_file(glove_path.replace('100', '50'), 5_000, custom_misc)
vocab
Vocabulary(5004 words, 50-D embeddings)
vocab.dim
50
[vocab.idx(word) for word in ('the', 'boat', 'house', '.')]
[4, 2381, 170, 6]
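The misc tokens themselves resolve to their custom indices; keys that appear in idx_misc skip the lowercasing step in idx():
[vocab.idx(tok) for tok in custom_misc]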

Without GloVe

def tokenize(text):
    return [t.text for t in nlp(text, disable=['parser', 'tagger', 'ner'])]

def encode(text, vocab):
    return [vocab.idx(t) for t in tokenize(text)]
text_path = '/Users/hmamin/data/bbc/tech/401.txt'
with open(text_path, 'r') as f:
    tokens = tokenize(f.read())

vocab = Vocabulary.from_tokens(tokens, all_lower=False)
vocab
Vocabulary(988 words)
vocab.dim
1
vocab.vector('the')
array([0.])
vocab.idx('The')
60
vocab.idx('the')
4
for i in range(10):
    word = vocab[i]
    print(i, word, vocab.idx(word), vocab.vector(word))
0 <PAD> 0 [0.]
1 <UNK> 1 [0.]
2 . 2 [0.]
3 , 3 [0.]
4 the 4 [0.]
5 I 5 [0.]
6 to 6 [0.]
7 a 7 [0.]
8 of 8 [0.]
9 and 9 [0.]
vocab.filter_tokens(tokens, max_words=15, min_freq=5, inplace=True)
vocab
Vocabulary(17 words)
vocab[:10]
['<PAD>', '<UNK>', '.', ',', 'the', 'I', 'to', 'a', 'of', 'and']
vocab.build_embedding_matrix()
array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

Test encoding

vocab = Vocabulary.from_glove_file('/Users/hmamin/data/glove/glove.6B.50d.txt', 5_000)
vocab
Vocabulary(5002 words, 50-D embeddings)
text = "It's kind of a really nice day."
np.ones(5) * 0
array([0., 0., 0., 0., 0.])
encoded = vocab.encode(text, nlp, 10)
encoded
array([  22,   11,  923,    5,    9,  590, 3084,  124,    4,    0])
vocab.decode(encoded)
"it 's kind of a really nice day . <PAD>"
vocab[1]
'<UNK>'
vocab[:6]
['<PAD>', '<UNK>', 'the', ',', '.', 'of']
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
class Encoder:
    
    def __init__(self, nlp, vocab, max_len, pad_end=True, trim_start=True):
        self.nlp = nlp
        self.vocab = vocab
        self.max_len = max_len
        self.pad_end = pad_end
        self.trim_start = trim_start
        self.pad_idx = self.vocab.idx('<PAD>')
        
    def encode(self, text):
        output = np.ones(self.max_len) * self.pad_idx
        encoded = [self.vocab.idx(tok.text) 
                   for tok in self.nlp(text.lower())]
        
        # Trim sentence in case it's longer than max_len.
        if len(encoded) > self.max_len:
            if self.trim_start:
                encoded = encoded[len(encoded) - self.max_len:]
            else:
                encoded = encoded[:self.max_len]

        # Overwrite padding at the start or end, depending on pad_end.
        if self.pad_end:
            output[:len(encoded)] = encoded
        else:
            output[self.max_len-len(encoded):] = encoded
        return output
for pad_end_bool in (True, False):
    for trim_start_bool in (True, False):
        print('\nPAD END', pad_end_bool, 'TRIM START', trim_start_bool)
        print(vocab.encode(text, nlp, 10, pad_end_bool, trim_start_bool))
        print(vocab.encode(text_long, nlp, 10, pad_end_bool, trim_start_bool))
PAD END True TRIM START True
[2.200e+01 1.100e+01 9.230e+02 5.000e+00 9.000e+00 5.900e+02 3.084e+03
 1.240e+02 4.000e+00 1.000e+00]
[  16. 2239.    3.    7.   22.   16.    9. 1868.  366.    4.]

PAD END True TRIM START False
[2.200e+01 1.100e+01 9.230e+02 5.000e+00 9.000e+00 5.900e+02 3.084e+03
 1.240e+02 4.000e+00 1.000e+00]
[2.000e+00 1.571e+03 3.400e+01 4.131e+03 3.000e+00 2.000e+00 3.507e+03
 1.600e+01 2.239e+03 3.000e+00]

PAD END False TRIM START True
[1.000e+00 2.200e+01 1.100e+01 9.230e+02 5.000e+00 9.000e+00 5.900e+02
 3.084e+03 1.240e+02 4.000e+00]
[  16. 2239.    3.    7.   22.   16.    9. 1868.  366.    4.]

PAD END False TRIM START False
[1.000e+00 2.200e+01 1.100e+01 9.230e+02 5.000e+00 9.000e+00 5.900e+02
 3.084e+03 1.240e+02 4.000e+00]
[2.000e+00 1.571e+03 3.400e+01 4.131e+03 3.000e+00 2.000e+00 3.507e+03
 1.600e+01 2.239e+03 3.000e+00]
encoder = Encoder(nlp, vocab, 10)
encoder.encode(text)
array([  22.,   11.,  923.,    5.,    9.,  590., 3084.,  124.,    4.,
          0.])
text_long = "The stars are bright, the sky is dark, and it is a cold night."
encoder.encode(text_long)
array([2.000e+00, 1.571e+03, 3.400e+01, 4.131e+03, 3.000e+00, 2.000e+00,
       3.507e+03, 1.600e+01, 2.239e+03, 3.000e+00])
encoder = Encoder(nlp, vocab, 10, pad_end=False, trim_start=True)
encoder.encode(text_long)
array([2.000e+00, 1.571e+03, 3.400e+01, 4.131e+03, 3.000e+00, 2.000e+00,
       3.507e+03, 1.600e+01, 2.239e+03, 3.000e+00])
encoder.encode(text)
array([   0.,   22.,   11.,  923.,    5.,    9.,  590., 3084.,  124.,
          4.])
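Because Encoder fixes the tokenizer and max_len up front, encoding a batch is just a comprehension followed by a stack (a small sketch over the two texts defined above):
batch = np.stack([encoder.encode(t) for t in (text, text_long)])
batch.shape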
[t.text for t in nlp(text_long)]
['The',
 'stars',
 'are',
 'bright',
 ',',
 'the',
 'sky',
 'is',
 'dark',
 ',',
 'and',
 'it',
 'is',
 'a',
 'cold',
 'night',
 '.']
vocab[3]
','