Skip to content

medcat.components.normalizing.normalizer

Classes:

Attributes:

CONTAINS_NUMBER module-attribute

CONTAINS_NUMBER = compile('[0-9]+')

BasicSpellChecker

BasicSpellChecker(cdb_vocab: dict[str, int], config: Config, data_vocab: Optional[Vocab] = None)

Methods:

  • P

    Probability of word.

  • candidates

    Generate possible spelling corrections for word.

  • edits1

    All edits that are one edit away from word.

  • edits2

    All edits that are two edits away from word.

  • edits3

    All edits that are three edits away from word.

  • fix

    Most probable spelling correction for word.

  • known

    The subset of words that appear in the dictionary of WORDS.

  • raw_edits1
  • raw_edits2

Attributes:

Source code in medcat-v2/medcat/components/normalizing/normalizer.py
17
18
19
20
21
def __init__(self, cdb_vocab: dict[str, int], config: Config,
             data_vocab: Optional[Vocab] = None):
    """Set up the spell checker from a CDB vocabulary and config."""
    self.config = config
    self.data_vocab = data_vocab
    # word -> count/rank mapping used by `P` and `known`
    self.vocab = cdb_vocab

config instance-attribute

config = config

data_vocab instance-attribute

data_vocab = data_vocab

vocab instance-attribute

vocab = cdb_vocab

P

P(word: str) -> float

Probability of word.

Parameters:

  • word

    (str) –

    The word in question.

Returns:

  • float ( float ) –

    The probability.

Source code in medcat-v2/medcat/components/normalizing/normalizer.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def P(self, word: str) -> float:
    """Probability of `word`.

    Uses the negative inverse of the word's vocabulary count/rank as
    a proxy score; words absent from the vocabulary score 0.

    Args:
        word (str): The word in question.

    Returns:
        float: The probability.
    """
    rank = self.vocab.get(word, 0)
    # 0 means the word is not in the dictionary at all
    return -1 / rank if rank else 0

candidates

candidates(word: str) -> Iterable[str]

Generate possible spelling corrections for word.

Parameters:

  • word

    (str) –

    The word.

Returns:

  • Iterable[str]

    Iterable[str]: The list of candidate words.

Source code in medcat-v2/medcat/components/normalizing/normalizer.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def candidates(self, word: str) -> Iterable[str]:
    """Generate possible spelling corrections for word.

    Args:
        word (str): The word.

    Returns:
        Iterable[str]: The list of candidate words.
    """
    # A known word wins outright.
    exact = self.known([word])
    if exact:
        return exact
    # Otherwise fall back to progressively more distant edits.
    one_away = self.known(self.edits1(word))
    if one_away:
        return one_away
    if self.config.general.spell_check_deep:
        # Deep spell check also considers a two-letter edit distance.
        two_away = self.known(self.edits2(word))
        if two_away:
            return two_away
    # Nothing matched - return the word itself.
    return [word]

edits1

edits1(word: str) -> set[str]

All edits that are one edit away from word.

Parameters:

  • word

    (str) –

    The word.

Returns:

  • set[str]

    set[str]: The set of all edits

Source code in medcat-v2/medcat/components/normalizing/normalizer.py
 95
 96
 97
 98
 99
100
101
102
103
104
def edits1(self, word: str) -> set[str]:
    """All edits that are one edit away from `word`.

    Args:
        word (str): The word.

    Returns:
        set[str]: The set of all edits
    """
    # Delegate to the classmethod, forwarding the diacritics setting.
    use_diacritics = self.config.general.diacritics
    return self.raw_edits1(word, use_diacritics)

edits2

edits2(word: str) -> Iterator[str]

All edits that are two edits away from word.

Parameters:

  • word

    (str) –

    The word to start from.

Returns:

Source code in medcat-v2/medcat/components/normalizing/normalizer.py
150
151
152
153
154
155
156
157
158
159
def edits2(self, word: str) -> Iterator[str]:
    """All edits that are two edits away from `word`.

    Args:
        word (str): The word to start from.

    Returns:
        Iterator[str]: All 2-away edits.
    """
    # Delegate to the classmethod, forwarding the diacritics setting.
    diacritics = self.config.general.diacritics
    return self.raw_edits2(word, diacritics)

edits3

edits3(word)

All edits that are three edits away from word.

Source code in medcat-v2/medcat/components/normalizing/normalizer.py
168
169
170
171
def edits3(self, word):
    """All edits that are three edits away from `word`.

    NOTE(review): not implemented - calling this always raises.

    Raises:
        ValueError: Always; there is no implementation.
    """  # noqa
    # Do d3 edits
    raise ValueError("No implementation")

fix

fix(word: str) -> Optional[str]

Most probable spelling correction for word.

Parameters:

  • word

    (str) –

    The word.

Returns:

  • Optional[str]

    Optional[str]: Fixed word, or None if no fixes were applied.

Source code in medcat-v2/medcat/components/normalizing/normalizer.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def fix(self, word: str) -> Optional[str]:
    """Most probable spelling correction for word.

    Args:
        word (str): The word.

    Returns:
        Optional[str]: Fixed word, or None if no fixes were applied.
    """
    best = max(self.candidates(word), key=self.P)
    # Only report a fix if it actually differs from the input word.
    return best if best != word else None

known

known(words: Iterable[str]) -> set[str]

The subset of words that appear in the dictionary of WORDS.

Parameters:

Returns:

  • set[str]

    set[str]: The set of candidates.

Source code in medcat-v2/medcat/components/normalizing/normalizer.py
84
85
86
87
88
89
90
91
92
93
def known(self, words: Iterable[str]) -> set[str]:
    """The subset of `words` that appear in the dictionary of WORDS.

    Args:
        words (Iterable[str]): The words.

    Returns:
        set[str]: The set of candidates.
    """
    vocab = self.vocab
    return {candidate for candidate in words if candidate in vocab}

raw_edits1 classmethod

raw_edits1(word: str, use_diacritics: bool = False, return_ordered: Literal[False] = False) -> set[str]
raw_edits1(word: str, use_diacritics: bool = False, return_ordered: Literal[True] = True) -> list[str]
raw_edits1(word: str, use_diacritics: bool = False, return_ordered: bool = False) -> Union[set[str], list[str]]
raw_edits1(word: str, use_diacritics: bool = False, return_ordered: bool = False) -> Union[set[str], list[str]]
Source code in medcat-v2/medcat/components/normalizing/normalizer.py
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
@classmethod
def raw_edits1(cls, word: str, use_diacritics: bool = False,
               return_ordered: bool = False) -> Union[set[str], list[str]]:
    """All single-edit variants of `word`.

    Generates deletes, transposes, replaces, and inserts (the classic
    Norvig edit-distance-1 neighborhood).

    Args:
        word (str): The word to edit.
        use_diacritics (bool): Whether to also use diacritic letters
            for replaces/inserts. Defaults to False.
        return_ordered (bool): If True, return a deterministically
            sorted list instead of a set. Defaults to False.

    Returns:
        Union[set[str], list[str]]: The distinct 1-away edits.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'

    if use_diacritics:
        letters += 'àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'

    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    edits: set[str] = set()
    for left, right in splits:
        if right:
            # delete the first char of the right part
            edits.add(left + right[1:])
            # replace the first char of the right part
            edits.update(left + ch + right[1:] for ch in letters)
        if len(right) > 1:
            # transpose the first two chars of the right part
            edits.add(left + right[1] + right[0] + right[2:])
        # insert a new char at the split point
        edits.update(left + ch + right for ch in letters)
    # BUGFIX: the ordered variant previously sorted the raw concatenated
    # lists and so contained duplicate edits (e.g. the same insert
    # reachable from two splits); both variants now return the same
    # distinct collection of edits.
    return sorted(edits) if return_ordered else edits

raw_edits2 classmethod

raw_edits2(word: str, use_diacritics: bool = False, return_ordered: bool = False) -> Iterator[str]
Source code in medcat-v2/medcat/components/normalizing/normalizer.py
161
162
163
164
165
166
@classmethod
def raw_edits2(cls, word: str, use_diacritics: bool = False,
               return_ordered: bool = False) -> Iterator[str]:
    """All edits that are two edits away from `word`.

    Lazily expands every 1-away edit by a second round of 1-away edits.

    Args:
        word (str): The word to start from.
        use_diacritics (bool): Forwarded to `raw_edits1`.
        return_ordered (bool): Forwarded to `raw_edits1`.

    Returns:
        Iterator[str]: All 2-away edits.
    """
    for first_edit in cls.raw_edits1(word, use_diacritics, return_ordered):
        yield from cls.raw_edits1(first_edit, use_diacritics, return_ordered)

TokenNormalizer

TokenNormalizer(nlp: BaseTokenizer, config: Config, cdb_vocab: dict[str, int], data_vocab: Optional[Vocab] = None)

Bases: AbstractCoreComponent

Will normalize all tokens in a spacy document.

Methods:

Attributes:

Source code in medcat-v2/medcat/components/normalizing/normalizer.py
179
180
181
182
183
184
def __init__(self, nlp: BaseTokenizer, config: Config,
             cdb_vocab: dict[str, int],
             data_vocab: Optional[Vocab] = None):
    """Set up the normalizer with a tokenizer and a spell checker."""
    self.nlp = nlp
    self.config = config
    # The spell checker shares the config and both vocabularies.
    self.spell_checker = BasicSpellChecker(cdb_vocab, config, data_vocab)

config instance-attribute

config = config

name class-attribute instance-attribute

name = 'token_normalizer'

nlp instance-attribute

nlp = nlp

spell_checker instance-attribute

spell_checker = BasicSpellChecker(cdb_vocab, config, data_vocab)

create_new_component classmethod

create_new_component(cnf: ComponentConfig, tokenizer: BaseTokenizer, cdb: CDB, vocab: Vocab, model_load_path: Optional[str]) -> TokenNormalizer
Source code in medcat-v2/medcat/components/normalizing/normalizer.py
224
225
226
227
228
229
@classmethod
def create_new_component(
        cls, cnf: ComponentConfig, tokenizer: BaseTokenizer,
        cdb: CDB, vocab: Vocab, model_load_path: Optional[str]
        ) -> 'TokenNormalizer':
    """Create a new TokenNormalizer from a CDB and vocab.

    Uses the CDB's config and token counts as the spell-check
    vocabulary. `cnf` and `model_load_path` are accepted for the
    component-factory interface but not used here.
    """
    return cls(tokenizer, cdb.config, cdb.token_counts, vocab)

get_type

get_type() -> CoreComponentType
Source code in medcat-v2/medcat/components/normalizing/normalizer.py
186
187
def get_type(self) -> CoreComponentType:
    """Return the core component type (token normalizing)."""
    return CoreComponentType.token_normalizing