
medcat.tokenizing.spacy_impl.tokenizers

Classes: SpacyTokenizer

Functions: spacy_split_all

Attributes: logger

logger module-attribute

logger = getLogger(__name__)

SpacyTokenizer

SpacyTokenizer(spacy_model_name: str, spacy_disabled_components: list[str], use_diacritics: bool, max_document_length: int, tokenizer_getter: Callable[[Language, bool], Tokenizer] = spacy_split_all, stopwords: Optional[set[str]] = None, avoid_pipe: bool = False)

Bases: BaseTokenizer

Methods: create_entity, create_new_tokenizer, entity_from_tokens, get_doc_class, get_entity_class, load_internals_from, save_internals_to

Source code in medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
def __init__(self, spacy_model_name: str,
             spacy_disabled_components: list[str],
             use_diacritics: bool,
             max_document_length: int,
             tokenizer_getter: Callable[[Language, bool], Tokenizer
                                        ] = spacy_split_all,
             stopwords: Optional[set[str]] = None,
             avoid_pipe: bool = False):
    self._spacy_model_name = os.path.basename(
        spacy_model_name).removeprefix(TOKENIZER_PREFIX)
    if self.load_internals_from(spacy_model_name):
        # i.e has something to load from path
        pass
    else:
        # no file provided, ensure the model is available
        ensure_spacy_model(self._spacy_model_name)
        spacy_model_name = self._spacy_model_name
    if stopwords is not None:
        lang_str = os.path.basename(spacy_model_name).removeprefix(
            TOKENIZER_PREFIX).split('_', 1)[0]
        cls = spacy.util.get_lang_class(lang_str)
        cls.Defaults.stop_words = set(stopwords)
    self._avoid_pipe = avoid_pipe
    self._nlp = spacy.load(spacy_model_name,
                           disable=spacy_disabled_components)
    self._nlp.tokenizer = tokenizer_getter(self._nlp, use_diacritics)
    self._nlp.max_length = max_document_length
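
A minimal construction sketch follows; the model name, disabled components, and stop words are illustrative assumptions, not values mandated by MedCAT. If the named spaCy model is not installed, the constructor downloads it via ensure_spacy_model.

from medcat.tokenizing.spacy_impl.tokenizers import SpacyTokenizer, spacy_split_all

# Sketch only: "en_core_web_md" and the disabled components are assumed values.
tokenizer = SpacyTokenizer(
    spacy_model_name="en_core_web_md",
    spacy_disabled_components=["ner", "parser"],
    use_diacritics=False,
    max_document_length=1_000_000,
    tokenizer_getter=spacy_split_all,      # the default tokenizer factory
    stopwords={"the", "a", "an"},          # replaces the language's default stop words
    avoid_pipe=False,
)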

create_entity

create_entity(doc: MutableDocument, token_start_index: int, token_end_index: int, label: str) -> MutableEntity
Source code in medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
def create_entity(self, doc: MutableDocument,
                  token_start_index: int, token_end_index: int,
                  label: str) -> MutableEntity:
    spacy_doc = cast(Document, doc)._delegate
    span = Span(spacy_doc, token_start_index, token_end_index, label)
    return Entity(span)

create_new_tokenizer classmethod

create_new_tokenizer(config: Config) -> SpacyTokenizer
Source code in medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
@classmethod
def create_new_tokenizer(cls, config: Config) -> 'SpacyTokenizer':
    nlp_cnf = config.general.nlp
    return cls(
        nlp_cnf.modelname,
        nlp_cnf.disabled_components,
        config.general.diacritics,
        config.preprocessing.max_document_length,
        stopwords=config.preprocessing.stopwords,
        avoid_pipe=config.general.nlp.faster_spacy_tokenization)
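
A hedged sketch of building the tokenizer from a MedCAT config; the medcat.config.Config import path and the assignments below are assumptions inferred from the attributes this classmethod reads, not a documented recipe.

from medcat.config import Config  # assumed import path for the Config type
from medcat.tokenizing.spacy_impl.tokenizers import SpacyTokenizer

config = Config()
config.general.nlp.modelname = "en_core_web_md"             # assumed model choice
config.general.nlp.disabled_components = ["ner", "parser"]  # assumed components to disable
config.preprocessing.max_document_length = 1_000_000
tokenizer = SpacyTokenizer.create_new_tokenizer(config)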

entity_from_tokens

entity_from_tokens(tokens: list[MutableToken]) -> MutableEntity
Source code in medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    if not tokens:
        raise ValueError("Need at least one token for an entity")
    spacy_tokens = cast(list[Token], tokens)
    span = Span(spacy_tokens[0]._delegate.doc, spacy_tokens[0].index,
                spacy_tokens[-1].index + 1)
    return Entity(span)

get_doc_class

get_doc_class() -> Type[MutableDocument]
Source code in medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
def get_doc_class(self) -> Type[MutableDocument]:
    return Document

get_entity_class

get_entity_class() -> Type[MutableEntity]
Source code in medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
def get_entity_class(self) -> Type[MutableEntity]:
    return Entity

load_internals_from

load_internals_from(folder_path: str) -> bool
Source code in medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
def load_internals_from(self, folder_path: str) -> bool:
    return os.path.exists(folder_path)

save_internals_to

save_internals_to(folder_path: str) -> str
Source code in medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
def save_internals_to(self, folder_path: str) -> str:
    subfolder_only = f"{TOKENIZER_PREFIX}{self._spacy_model_name}"
    subfolder = os.path.join(folder_path, subfolder_only)
    if os.path.exists(subfolder):
        # NOTE: always overwrite
        shutil.rmtree(subfolder)
    logger.debug("Saving spacy model to '%s'", subfolder)
    self._nlp.to_disk(subfolder)
    return subfolder_only
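
A sketch of the save/load round trip implied by save_internals_to and load_internals_from; "some_model_pack" is a hypothetical folder and `tokenizer` is the instance from the earlier construction sketch.

import os

# save_internals_to returns the tokenizer-specific subfolder name
# (TOKENIZER_PREFIX + spaCy model name) created under the given folder.
subfolder = tokenizer.save_internals_to("some_model_pack")

# Passing the saved path back to the constructor makes load_internals_from()
# return True, so spacy.load reads the pipeline from disk instead of
# resolving/downloading the named model.
restored = SpacyTokenizer(
    spacy_model_name=os.path.join("some_model_pack", subfolder),
    spacy_disabled_components=[],
    use_diacritics=False,
    max_document_length=1_000_000,
)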

spacy_split_all

spacy_split_all(nlp: Language, use_diacritics: bool) -> Tokenizer
Source code in medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
def spacy_split_all(nlp: Language, use_diacritics: bool) -> Tokenizer:

    token_characters = r'[^A-Za-z0-9\@]'

    if use_diacritics:
        token_characters = r'[^A-Za-zÀ-ÖØ-öø-ÿ0-9\@]'

    infix_re = re.compile(token_characters)
    suffix_re = re.compile(token_characters + r'$')
    prefix_re = re.compile(r'^' + token_characters)
    return Tokenizer(nlp.vocab,
                     rules={},
                     token_match=None,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer
                     )
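
spacy_split_all builds a spaCy Tokenizer that treats every character outside A-Za-z0-9 and '@' (plus the accented Latin ranges when use_diacritics is True) as a prefix, suffix, or infix split point. A small, self-contained illustration on a blank pipeline:

import spacy
from medcat.tokenizing.spacy_impl.tokenizers import spacy_split_all

nlp = spacy.blank("en")
nlp.tokenizer = spacy_split_all(nlp, use_diacritics=False)

doc = nlp("CRP>120mg/L recorded on 2024-01-05")
print([t.text for t in doc])
# punctuation such as '>', '/' and '-' becomes separate tokens,
# while runs of letters and digits (e.g. '120mg') stay together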