Skip to content

medcat.vocab

Classes:

  • Vocab

    Vocabulary used to store word embeddings for context similarity

Attributes:

WordDescriptor module-attribute

WordDescriptor = TypedDict('WordDescriptor', {'vector': Optional[ndarray], 'count': int, 'index': int})

logger module-attribute

logger = getLogger(__name__)

Vocab

Vocab()

Bases: AbstractSerialisable

Vocabulary used to store word embeddings for context similarity calculation. Also used by the spell checker — but not for fixing the spelling, only for checking whether something is correct.

Properties

vocab (dict[str, WordDescriptor]): Map from word to attributes, e.g. {'house': {'vector': <embedding>, 'count': <count>, ...}, ...}. index2word (dict[int, str]): Map from an index to a word — used for negative sampling. vec_index2word (dict[int, str]): Same as index2word, but only for words that have vectors.

Methods:

Attributes:

Source code in medcat-v2/medcat/vocab.py
41
42
43
44
45
46
def __init__(self) -> None:
    """Create an empty vocabulary with no words, indices or vectors."""
    super().__init__()
    # word -> descriptor holding 'vector', 'count' and 'index'
    self.vocab: dict[str, WordDescriptor] = {}
    # running index -> word (used for negative sampling)
    self.index2word: dict[int, str] = {}
    # subset of index2word restricted to words that carry a vector
    self.vec_index2word: dict[int, str] = {}
    # cumulative probabilities; built lazily by init_cumsums
    self.cum_probs: np.ndarray = np.array([])

cum_probs instance-attribute

cum_probs: ndarray = array([])

index2word instance-attribute

index2word: dict[int, str] = {}

vec_index2word instance-attribute

vec_index2word: dict[int, str] = {}

vocab instance-attribute

vocab: dict[str, WordDescriptor] = {}

add_vec

add_vec(word: str, vec: ndarray) -> None

Add vector to a word.

Parameters:

  • word

    (str) –

    To which word to add the vector.

  • vec

    (ndarray) –

    The vector to add.

Source code in medcat-v2/medcat/vocab.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def add_vec(self, word: str, vec: np.ndarray) -> None:
    """Add vector to a word.

    Args:
        word(str):
            To which word to add the vector.
        vec(np.ndarray):
            The vector to add.
    """
    word_info = self.vocab[word]
    word_info['vector'] = vec

    # If the word was not vector-backed before, register it now.
    index = word_info['index']
    self.vec_index2word.setdefault(index, word)

add_word

add_word(word: str, cnt: int = 1, vec: Optional[ndarray] = None, replace: bool = True) -> None

Add a word to the vocabulary

Parameters:

  • word

    (str) –

    The word to be added, it should be lemmatized and lowercased

  • cnt

    (int, default: 1 ) –

    Count of this word in your dataset (Default value = 1)

  • vec

    (Optional[ndarray], default: None ) –

    The vector representation of the word (Default value = None)

  • replace

    (bool, default: True ) –

    Will replace old vector representation (Default value = True)

Source code in medcat-v2/medcat/vocab.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def add_word(self, word: str, cnt: int = 1,
             vec: Optional[np.ndarray] = None,
             replace: bool = True) -> None:
    """Add a word to the vocabulary

    Args:
        word (str):
            The word to be added, it should be lemmatized and lowercased
        cnt (int):
            Count of this word in your dataset (Default value = 1)
        vec (Optional[np.ndarray]):
            The vector representation of the word (Default value = None)
        replace (bool):
            Will replace old vector representation (Default value = True)
    """
    if word in self.vocab:
        # Known word: only touch it when replacement is requested and
        # an actual vector was supplied.
        if not replace or vec is None:
            return
        word_info = self.vocab[word]
        word_info['vector'] = vec
        word_info['count'] = cnt

        # First vector for this word -> expose it for sampling.
        index = word_info['index']
        if index not in self.vec_index2word:
            self.vec_index2word[index] = word
        return

    # New word: hand out the next free index.
    # NOTE: If one were to manually remove a word, this could have
    #       issues, but the Vocab should - in general - be pretty
    #       stable, so shouldn't be an issue
    new_index = len(self.index2word)
    self.index2word[new_index] = word
    self.vocab[word] = {'vector': vec, 'count': cnt, 'index': new_index}

    if vec is not None:
        self.vec_index2word[new_index] = word

add_words

add_words(path: str, replace: bool = True) -> None

Adds words to the vocab from a file, the file is required to have the following format (vec being optional): `<word>\t<cnt>[\t<vec_space_separated>]`

e.g. one line: the word house with 3 dimensional vectors house 34444 0.3232 0.123213 1.231231

Parameters:

  • path

    (str) –

    path to the file with words and vectors

  • replace

    (bool, default: True ) –

    existing words in the vocabulary will be replaced. Defaults to True.

Source code in medcat-v2/medcat/vocab.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def add_words(self, path: str, replace: bool = True) -> None:
    """Adds words to the vocab from a file, the file
    is required to have the following format (vec being optional):
        <word>\t<cnt>[\t<vec_space_separated>]

    e.g. one line: the word house with 3 dimensional vectors
        house   34444   0.3232 0.123213 1.231231

    Args:
        path(str):
            path to the file with words and vectors
        replace(bool):
            existing words in the vocabulary will be replaced.
            Defaults to True.
    """
    with open(path) as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at the end of
            # the file) which would otherwise raise an IndexError when
            # accessing the count column below.
            if not line.strip():
                continue
            parts = line.split("\t")
            word = parts[0]
            cnt = int(parts[1].strip())
            vec = None
            if len(parts) == 3:
                floats = [float(x) for x in parts[2].strip().split(" ")]
                vec = np.array(floats)

            self.add_word(word, cnt, vec, replace)

count

count(word: str) -> int
Source code in medcat-v2/medcat/vocab.py
281
282
def count(self, word: str) -> int:
    """Return the stored occurrence count for `word`."""
    word_info = self.vocab[word]
    return word_info['count']

get_negative_samples

get_negative_samples(n: int = 6, ignore_punct_and_num: bool = False) -> list[int]

Get N negative samples.

Parameters:

  • n

    (int, default: 6 ) –

    How many words to return (Default value = 6)

  • ignore_punct_and_num

    (bool, default: False ) –

    Whether to ignore punctuation and numbers. Defaults to False.

Raises:

  • Exception

    If no unigram table is present.

Returns:

  • list[int]

    list[int]: Indices for words in this vocabulary.

Source code in medcat-v2/medcat/vocab.py
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
def get_negative_samples(self, n: int = 6,
                         ignore_punct_and_num: bool = False) -> list[int]:
    """Get N negative samples.

    Args:
        n (int):
            How many words to return (Default value = 6)
        ignore_punct_and_num (bool):
            Whether to ignore punctuation and numbers. Defaults to False.

    Returns:
        list[int]:
            Indices for words in this vocabulary.
    """
    # Lazily build the cumulative distribution on first use.
    if len(self.cum_probs) == 0:
        self.init_cumsums()
    draws = np.random.rand(n)
    # searchsorted yields slots within cum_probs, which only covers
    # words that have vectors ...
    slots = cast(
        list[int], np.searchsorted(self.cum_probs, draws).tolist())
    # ... so each slot has to be mapped back to its real word index.
    indices = [self._index_list[slot] for slot in slots]

    if not ignore_punct_and_num:
        return indices
    # Keep only indices whose word contains at least one letter.
    return [ind for ind in indices
            if self.index2word[ind].upper().isupper()]

get_vectors

get_vectors(indices: list[int]) -> list[ndarray]
Source code in medcat-v2/medcat/vocab.py
271
272
273
def get_vectors(self, indices: list[int]) -> list[np.ndarray]:
    """Collect vectors for the given word indices.

    Indices that are not vector-backed are silently skipped.
    """
    vectors = []
    for index in indices:
        if index not in self.vec_index2word:
            continue
        vectors.append(self.vec(self.index2word[index]))  # type: ignore
    return vectors

inc_or_add

inc_or_add(word: str, cnt: int = 1, vec: Optional[ndarray] = None) -> None

Add a word or increase its count.

Parameters:

  • word

    (str) –

    Word to be added

  • cnt

    (int, default: 1 ) –

    By how much should the count be increased, or to what should it be set if a new word. (Default value = 1)

  • vec

    (Optional[ndarray], default: None ) –

    Word vector (Default value = None)

Source code in medcat-v2/medcat/vocab.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def inc_or_add(self, word: str, cnt: int = 1,
               vec: Optional[np.ndarray] = None) -> None:
    """Add a word or increase its count.

    Args:
        word(str):
            Word to be added
        cnt(int):
            By how much should the count be increased, or to what
            should it be set if a new word. (Default value = 1)
        vec(Optional[np.ndarray]):
            Word vector (Default value = None)
    """
    if word in self.vocab:
        # Already known - just bump the count.
        self.inc_wc(word, cnt)
    else:
        self.add_word(word, cnt, vec)

inc_wc

inc_wc(word: str, cnt: int = 1) -> None

Increase word count by cnt.

Parameters:

  • word

    (str) –

    For which word to increase the count

  • cnt

    (int, default: 1 ) –

By how much to increase the count (Default value = 1)

Source code in medcat-v2/medcat/vocab.py
 98
 99
100
101
102
103
104
105
106
107
def inc_wc(self, word: str, cnt: int = 1) -> None:
    """Increase the word count by `cnt`.

    Args:
        word(str):
            For which word to increase the count
        cnt(int):
            By how much to increase the count (Default value = 1)
    """
    word_info = self.item(word)
    word_info['count'] += cnt

init_cumsums

init_cumsums() -> None

Initialise cumulative sums.

This is in place of the unigram table. But similarly to it, this approach allows generating a list of indices that match the probabilistic distribution expected as per the word counts of each word.

Source code in medcat-v2/medcat/vocab.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
def init_cumsums(self) -> None:
    """Initialise cumulative sums.

    This is in place of the unigram table. But similarly to it, this
    approach allows generating a list of indices that match the
    probabilistic distribution expected as per the word counts of each
    word.
    """
    # vec_index2word only covers words that have vectors, so the slot
    # a word occupies in it generally differs from its word index.
    # e.g:
    #    if we have words indexed 0, 1, and 2
    #    but only 0, and 2 have corresponding vectors
    #    then only 0 and 2 will occur in vec_index2word
    #    and while 0 will be in the 0th position (as expected)
    #    in the final probability list, 2 will be in 1st position
    #    so we need to mark that conversion down
    index_list: list[int] = []
    raw_freqs: list[int] = []
    for word_index, word in self.vec_index2word.items():
        index_list.append(word_index)
        raw_freqs.append(self[word])

    # Raise the counts to the 3/4 power and normalise them into
    # probabilities.
    freqs = np.array(raw_freqs) ** (3 / 4)
    freqs /= freqs.sum()

    self.cum_probs = np.cumsum(freqs)
    # the mapping from vector index order to word indices
    self._index_list = index_list

item

item(word: str) -> WordDescriptor
Source code in medcat-v2/medcat/vocab.py
284
285
def item(self, word: str) -> WordDescriptor:
    return self.vocab[word]

load classmethod

load(path: str) -> Vocab
Source code in medcat-v2/medcat/vocab.py
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
@classmethod
def load(cls, path: str) -> 'Vocab':
    """Load a Vocab from `path`, converting legacy formats if needed."""
    if should_serialise_as_zip(path, 'auto'):
        # Zip-based serialisation.
        vocab = deserialise_from_zip(path)
    elif os.path.isfile(path) and path.endswith('.dat'):
        # A legacy (v1) .dat vocab - convert it unless that is disabled.
        if avoid_legacy_conversion():
            raise LegacyConversionDisabledError("Vocab")
        from medcat.utils.legacy.convert_vocab import (
            get_vocab_from_old)
        doing_legacy_conversion_message(logger, 'Vocab', path)
        vocab = get_vocab_from_old(path)
    else:
        vocab = deserialise(path)
    if not isinstance(vocab, Vocab):
        raise ValueError(f"The path '{path}' is not a Vocab!")
    return vocab

remove_all_vectors

remove_all_vectors() -> None

Remove all stored vector representations.

Source code in medcat-v2/medcat/vocab.py
66
67
68
69
70
71
def remove_all_vectors(self) -> None:
    """Remove all stored vector representations."""
    # No word is vector-backed any more.
    self.vec_index2word = {}

    for word_info in self.vocab.values():
        word_info['vector'] = None

remove_words_below_cnt

remove_words_below_cnt(cnt: int) -> None

Remove all words with frequency below cnt.

Parameters:

  • cnt

    (int) –

    Word count limit.

Source code in medcat-v2/medcat/vocab.py
73
74
75
76
77
78
79
80
81
82
83
84
85
def remove_words_below_cnt(self, cnt: int) -> None:
    """Remove all words with frequency below cnt.

    Args:
        cnt(int):
            Word count limit.
    """
    # Collect first so we never delete while iterating the dict.
    rare_words = [word for word, word_info in self.vocab.items()
                  if word_info['count'] < cnt]
    for word in rare_words:
        del self.vocab[word]

    # Rebuild index2word and vec_index2word
    self._rebuild_index()

reset_counts

reset_counts(cnt: int = 1) -> None

Reset the count for all words to cnt.

Parameters:

  • cnt

    (int, default: 1 ) –

    New count for all words in the vocab. (Default value = 1)

Source code in medcat-v2/medcat/vocab.py
124
125
126
127
128
129
130
131
132
def reset_counts(self, cnt: int = 1) -> None:
    """Reset the count of every word in the vocab to `cnt`.

    Args:
        cnt(int):
            New count for all words in the vocab. (Default value = 1)
    """
    for word_info in self.vocab.values():
        word_info['count'] = cnt

save

save(save_path: str, serialiser: Union[str, AvailableSerialisers] = dill, overwrite: bool = False, as_zip: Union[bool, Literal['auto']] = 'auto') -> None

Save Vocab at path.

Parameters:

  • save_path

    (str) –

    The path to save at.

  • serialiser

    (Union[str, AvailableSerialisers], default: dill ) –

    The serialiser. Defaults to AvailableSerialisers.dill.

  • overwrite

    (bool, default: False ) –

    Whether to allow overwriting existing files. Defaults to False.

  • as_zip

    (Union[bool, Literal['auto']], default: 'auto' ) –

Whether to serialise the Vocab as a zip.

Source code in medcat-v2/medcat/vocab.py
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
def save(self, save_path: str,
         serialiser: Union[
             str, AvailableSerialisers] = AvailableSerialisers.dill,
         overwrite: bool = False,
         as_zip: Union[bool, Literal['auto']] = 'auto',
         ) -> None:
    """Save Vocab at path.

    Args:
        save_path (str):
            The path to save at.
        serialiser (Union[str, AvailableSerialisers], optional):
            The serialiser. Defaults to AvailableSerialisers.dill.
        overwrite (bool, optional):
            Whether to allow overwriting existing files. Defaults to False.
        as_zip (Union[bool, Literal['auto']]):
            Whether to serialise the vocab as a zip.
    """
    if should_serialise_as_zip(save_path, as_zip):
        serialise_as_zip(self, save_path, serialiser, overwrite=overwrite)
    else:
        serialise(serialiser, self, save_path, overwrite=overwrite)

update_counts

update_counts(tokens: list[str]) -> None

Given a list of tokens update counts for words in the vocab.

Parameters:

  • tokens

    (list[str]) –

    Usually a large block of text split into tokens/words.

Source code in medcat-v2/medcat/vocab.py
134
135
136
137
138
139
140
141
142
143
def update_counts(self, tokens: list[str]) -> None:
    """Given a list of tokens update counts for words in the vocab.

    Args:
        tokens(list[str]):
            Usually a large block of text split into tokens/words.
    """
    # Only tokens already present in the vocab get counted.
    known_tokens = (token for token in tokens if token in self)
    for token in known_tokens:
        self.inc_wc(token, 1)

vec

vec(word: str) -> Optional[ndarray]
Source code in medcat-v2/medcat/vocab.py
278
279
def vec(self, word: str) -> Optional[np.ndarray]:
    """Return the stored vector for `word` (None if it has no vector)."""
    word_info = self.vocab[word]
    return word_info['vector']