Skip to content

medcat.components.addons.meta_cat.mctokenizers.tokenizers

Classes:

- TokenizerWrapperBase

Functions:

- init_tokenizer
- load_tokenizer

Attributes:

- FAKE_TOKENIZER_PATH

FAKE_TOKENIZER_PATH module-attribute

FAKE_TOKENIZER_PATH = '#\n/fake-path-not-exist#/'

TokenizerWrapperBase

TokenizerWrapperBase(hf_tokenizer: Optional[Tokenizer] = None)

Bases: ABC

Methods:

- ensure_tokenizer
- get_pad_id
- get_size
- load
- save
- token_to_id

Attributes:

- hf_tokenizers
- name

Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/tokenizers.py
15
16
def __init__(self, hf_tokenizer: Optional[Tokenizer] = None) -> None:
    """Create the wrapper around an optional tokenizer.

    Args:
        hf_tokenizer: The tokenizer to wrap. ``None`` (the default)
            leaves the wrapper without a tokenizer.
    """
    # NOTE: the attribute is spelled in the plural (`hf_tokenizers`) even
    # though it holds a single tokenizer or None.
    self.hf_tokenizers = hf_tokenizer

hf_tokenizers instance-attribute

hf_tokenizers = hf_tokenizer

name instance-attribute

name: str

ensure_tokenizer

ensure_tokenizer() -> Tokenizer
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/tokenizers.py
45
46
47
48
def ensure_tokenizer(self) -> Tokenizer:
    """Return the wrapped tokenizer, failing loudly if there is none.

    Returns:
        The object stored in ``self.hf_tokenizers``.

    Raises:
        ValueError: If ``self.hf_tokenizers`` is ``None``.
    """
    wrapped = self.hf_tokenizers
    # Only None counts as "missing"; any other value is handed back as-is.
    if wrapped is not None:
        return wrapped
    raise ValueError("The tokenizer is not loaded yet")

get_pad_id abstractmethod

get_pad_id() -> Union[Optional[int], list[int]]
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/tokenizers.py
42
43
@abstractmethod
def get_pad_id(self) -> Union[Optional[int], list[int]]:
    """Abstract hook: report the padding id of the tokenizer.

    Returns:
        Per the annotation, an ``int``, ``None``, or a list of ints.
        NOTE(review): which of these a given subclass produces, and what
        a list or ``None`` means, is decided by the subclass.
    """

get_size abstractmethod

get_size() -> int
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/tokenizers.py
36
37
@abstractmethod
def get_size(self) -> int:
    """Abstract hook: report the tokenizer's size as an ``int``.

    NOTE(review): presumably the vocabulary size; the exact meaning is
    defined by the implementing subclass.
    """

load abstractmethod classmethod

load(dir_path: str, model_variant: Optional[str] = '', **kwargs) -> Tokenizer
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/tokenizers.py
31
32
33
34
@classmethod
@abstractmethod
def load(
    cls, dir_path: str, model_variant: Optional[str] = '', **kwargs
) -> Tokenizer:
    """Abstract class-level hook: load a tokenizer from a directory.

    Args:
        dir_path: Directory to load from.
        model_variant: Optional variant string; defaults to ``''``.
        **kwargs: Extra keyword arguments for the implementing subclass.

    Returns:
        Annotated as ``Tokenizer``; the concrete result is produced by
        the subclass implementation.
    """

save abstractmethod

save(dir_path: str) -> None
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/tokenizers.py
28
29
@abstractmethod
def save(self, dir_path: str) -> None:
    """Abstract hook: persist the tokenizer.

    Args:
        dir_path: Destination directory path handed to the subclass
            implementation.
    """

token_to_id abstractmethod

token_to_id(token: str) -> Union[int, list[int]]
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/tokenizers.py
39
40
@abstractmethod
def token_to_id(self, token: str) -> Union[int, list[int]]:
    """Abstract hook: map a token string to its id(s).

    Args:
        token: The token text to look up.

    Returns:
        Per the annotation, a single ``int`` or a list of ints; which
        one is decided by the implementing subclass.
    """

init_tokenizer

init_tokenizer(cnf: ConfigMetaCAT) -> Optional[TokenizerWrapperBase]
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/tokenizers.py
51
52
53
54
55
56
57
58
59
60
61
def init_tokenizer(cnf: ConfigMetaCAT) -> Optional[TokenizerWrapperBase]:
    """Create a fresh tokenizer wrapper for the configured tokenizer name.

    Args:
        cnf: MetaCAT config; ``cnf.general.tokenizer_name`` selects the
            wrapper and ``cnf.model.model_variant`` is passed on for the
            BERT tokenizer.

    Returns:
        A new ``TokenizerWrapperBPE`` for ``'bbpe'``, a new
        ``TokenizerWrapperBERT`` for ``'bert-tokenizer'``, and ``None``
        for any other name.
    """
    requested = cnf.general.tokenizer_name
    # The wrapper modules are imported lazily, only on the branch that
    # actually needs them.
    if requested == 'bbpe':
        from medcat.components.addons.meta_cat.mctokenizers.bpe_tokenizer import TokenizerWrapperBPE  # noqa
        return TokenizerWrapperBPE.create_new()
    if requested == 'bert-tokenizer':
        from medcat.components.addons.meta_cat.mctokenizers.bert_tokenizer import TokenizerWrapperBERT  # noqa
        return TokenizerWrapperBERT.create_new(cnf.model.model_variant)
    return None

load_tokenizer

load_tokenizer(config: ConfigMetaCAT, tokenizer_folder: str) -> Optional[TokenizerWrapperBase]
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/tokenizers.py
64
65
66
67
68
69
70
71
72
73
74
75
76
def load_tokenizer(config: ConfigMetaCAT, tokenizer_folder: str
                   ) -> Optional[TokenizerWrapperBase]:
    """Load the tokenizer wrapper matching the configured tokenizer name.

    Args:
        config: MetaCAT config; ``config.general.tokenizer_name`` selects
            the wrapper and ``config.model.model_variant`` is passed on
            for the BERT tokenizer.
        tokenizer_folder: Path handed to the wrapper's ``load``.

    Returns:
        The result of ``TokenizerWrapperBPE.load`` for ``'bbpe'``, the
        result of ``TokenizerWrapperBERT.load`` for ``'bert-tokenizer'``,
        and ``None`` for any other name.
    """
    requested = config.general.tokenizer_name
    # The wrapper modules are imported lazily, only on the branch that
    # actually needs them.
    if requested == 'bbpe':
        from medcat.components.addons.meta_cat.mctokenizers.bpe_tokenizer import TokenizerWrapperBPE  # noqa
        return TokenizerWrapperBPE.load(tokenizer_folder)
    if requested == 'bert-tokenizer':
        from medcat.components.addons.meta_cat.mctokenizers.bert_tokenizer import TokenizerWrapperBERT  # noqa
        variant = config.model.model_variant
        return TokenizerWrapperBERT.load(tokenizer_folder, variant)
    return None