Skip to content

medcat.components.addons.meta_cat.mctokenizers.bpe_tokenizer

Classes:

Attributes:

FAKE_TOKENIZER_PATH module-attribute

FAKE_TOKENIZER_PATH = '#\n/fake-path-not-exist#/'

TokenizerWrapperBPE

TokenizerWrapperBPE(hf_tokenizers: Optional[ByteLevelBPETokenizer] = None)

Bases: TokenizerWrapperBase

Wrapper around a huggingface tokenizer so that it works with the MetaCAT models.

Parameters:

  • hf_tokenizers (tokenizers.ByteLevelBPETokenizer, optional)

    A huggingface BBPE tokenizer. Defaults to None.

Methods:

Attributes:

Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bpe_tokenizer.py
22
23
24
25
26
27
28
29
def __init__(self, hf_tokenizers: Optional[ByteLevelBPETokenizer] = None
             ) -> None:
    """Wrap an optional huggingface BBPE tokenizer.

    Args:
        hf_tokenizers: A huggingface ByteLevelBPETokenizer, or None.
    """
    super().__init__(hf_tokenizers)

    if self.hf_tokenizers is None:
        return
    # Added tokens do not persist with this tokenizer for whatever
    # reason, so the <PAD> token is (re-)added whenever a tokenizer
    # is attached.
    self.hf_tokenizers.add_tokens(['<PAD>'])

name class-attribute instance-attribute

name = 'bbpe'

create_new classmethod

create_new()
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bpe_tokenizer.py
 99
100
101
102
@classmethod
def create_new(cls):
    """Build a wrapper around a brand-new (untrained) BBPE tokenizer."""
    return cls(ByteLevelBPETokenizer())

get_pad_id

get_pad_id() -> Union[int, list[int]]
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bpe_tokenizer.py
112
113
114
115
116
117
118
def get_pad_id(self) -> Union[int, list[int]]:
    """Return the vocabulary ID of the '<PAD>' token.

    Returns:
        Union[int, list[int]]: The ID mapped to '<PAD>'.

    Raises:
        ValueError: If the tokenizer vocabulary has no '<PAD>' token.
    """
    pad = self.token_to_id('<PAD>')
    if pad is None:
        # ValueError instead of a bare Exception: more specific, and
        # still caught by any existing `except Exception` handlers.
        raise ValueError(
            "No <PAD> token in the vocabulary of the tokenizer, "
            "please add it")
    return pad

get_size

get_size() -> int
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bpe_tokenizer.py
104
105
106
def get_size(self) -> int:
    """Return the size of the wrapped tokenizer's vocabulary."""
    tokenizer = self.ensure_tokenizer()
    self.hf_tokenizers = tokenizer
    return tokenizer.get_vocab_size()

load classmethod

load(dir_path: str, model_variant: Optional[str] = '', **kwargs) -> TokenizerWrapperBPE
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bpe_tokenizer.py
86
87
88
89
90
91
92
93
94
95
96
97
@classmethod
def load(cls, dir_path: str, model_variant: Optional[str] = '', **kwargs
         ) -> "TokenizerWrapperBPE":
    """Load a saved BBPE tokenizer from a directory.

    Args:
        dir_path: Directory containing `<name>-vocab.json` and
            `<name>-merges.txt`.
        model_variant: Unused for this tokenizer type.
        **kwargs: Forwarded to `ByteLevelBPETokenizer.from_file`.

    Returns:
        TokenizerWrapperBPE: The loaded wrapper.
    """
    wrapper = cls()
    prefix = wrapper.name
    hf_tok = ByteLevelBPETokenizer.from_file(
        vocab_filename=os.path.join(dir_path, f'{prefix}-vocab.json'),
        merges_filename=os.path.join(dir_path, f'{prefix}-merges.txt'),
        **kwargs)
    # Added tokens do not persist with this tokenizer, so the <PAD>
    # token has to be re-added on every load.
    hf_tok.add_tokens(['<PAD>'])
    wrapper.hf_tokenizers = hf_tok
    return wrapper

save

save(dir_path: str) -> None
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bpe_tokenizer.py
78
79
80
81
82
83
84
def save(self, dir_path: str) -> None:
    """Write the tokenizer's model files into `dir_path`.

    Args:
        dir_path: Target directory for the vocab/merges files.

    Raises:
        ValueError: If no underlying tokenizer is available.
    """
    tokenizer = self.ensure_tokenizer()
    self.hf_tokenizers = tokenizer
    if tokenizer is None:
        raise ValueError("The tokenizer is not loaded yet")
    tokenizer.save_model(dir_path, prefix=self.name)

token_to_id

token_to_id(token: str) -> Union[int, list[int]]
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bpe_tokenizer.py
108
109
110
def token_to_id(self, token: str) -> Union[int, list[int]]:
    """Map a token string to its ID via the wrapped tokenizer."""
    tokenizer = self.ensure_tokenizer()
    self.hf_tokenizers = tokenizer
    return tokenizer.token_to_id(token)