Skip to content

medcat.components.addons.meta_cat.mctokenizers.bert_tokenizer

Classes:

Attributes:

FAKE_TOKENIZER_PATH module-attribute

FAKE_TOKENIZER_PATH = '#\n/fake-path-not-exist#/'

TokenizerWrapperBERT

TokenizerWrapperBERT(hf_tokenizers: Optional[BertTokenizerFast] = None)

Bases: TokenizerWrapperBase

Wrapper around a huggingface BERT tokenizer so that it works with the MetaCAT models.

Parameters:

  • hf_tokenizers (Optional[transformers.models.bert.tokenization_bert_fast.BertTokenizerFast], default: None)

    A huggingface fast BERT tokenizer to wrap.

Methods:

Attributes:

Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bert_tokenizer.py
23
24
25
def __init__(
    self,
    hf_tokenizers: Optional[BertTokenizerFast] = None,
) -> None:
    """Create the wrapper, optionally around an existing tokenizer.

    Args:
        hf_tokenizers (Optional[BertTokenizerFast]): The huggingface fast
            BERT tokenizer to wrap. Defaults to None, in which case the
            wrapper starts without a tokenizer.
    """
    # All state handling is delegated to the base wrapper's initialiser.
    super().__init__(hf_tokenizers)

name class-attribute instance-attribute

name = 'bert-tokenizer'

create_new classmethod

create_new(model_variant: Optional[str]) -> TokenizerWrapperBERT
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bert_tokenizer.py
85
86
87
88
@classmethod
def create_new(
    cls, model_variant: Optional[str]
) -> 'TokenizerWrapperBERT':
    """Create a wrapper without reading a tokenizer from a local folder.

    Args:
        model_variant (Optional[str]): The model variant handed on to
            `load`, which uses it to pick the tokenizer to fetch.

    Returns:
        TokenizerWrapperBERT: The wrapper returned by `load`.
    """
    # Passing the sentinel path makes `load` take its "no path provided"
    # branch instead of looking for a saved tokenizer on disk.
    return cls.load(FAKE_TOKENIZER_PATH, model_variant)

get_pad_id

get_pad_id() -> Optional[int]
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bert_tokenizer.py
 98
 99
100
def get_pad_id(self) -> Optional[int]:
    """Get the padding token ID of the wrapped tokenizer.

    Returns:
        Optional[int]: The `pad_token_id` attribute of the tokenizer.
    """
    # `ensure_tokenizer` is defined outside this snippet; its result is
    # stored back on the instance before being used.
    hf_tok = self.ensure_tokenizer()
    self.hf_tokenizers = hf_tok
    return hf_tok.pad_token_id

get_size

get_size() -> int
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bert_tokenizer.py
90
91
92
def get_size(self) -> int:
    """Get the vocabulary size of the wrapped tokenizer.

    Returns:
        int: The number of entries in the tokenizer's `vocab`.
    """
    # `ensure_tokenizer` is defined outside this snippet; its result is
    # stored back on the instance before being used.
    hf_tok = self.ensure_tokenizer()
    self.hf_tokenizers = hf_tok
    vocabulary = hf_tok.vocab
    return len(vocabulary)

load classmethod

load(dir_path: str, model_variant: Optional[str] = '', **kwargs) -> TokenizerWrapperBERT
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bert_tokenizer.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
@classmethod
def load(
    cls, dir_path: str, model_variant: Optional[str] = '', **kwargs
) -> "TokenizerWrapperBERT":
    """Load a wrapped tokenizer from a folder or from the library.

    Args:
        dir_path (str): The folder holding the saved tokenizer (in a
            sub-folder named after `cls.name`), or `FAKE_TOKENIZER_PATH`
            to signal that there is no saved tokenizer.
        model_variant (Optional[str]): The model variant to load when
            `dir_path` is `FAKE_TOKENIZER_PATH`; not used otherwise.
            Defaults to ''.
        **kwargs: Passed to `BertTokenizerFast.from_pretrained` when
            loading from `dir_path`; not used when loading by variant.

    Returns:
        TokenizerWrapperBERT: A new wrapper with its tokenizer set.
    """
    wrapper = cls()
    if dir_path == FAKE_TOKENIZER_PATH:
        # NOTE: meta_cat.py calls this with a string (in 2 places), but
        #       the super class requires the argument to be `Optional`,
        #       hence the explicit conversion to `str`.
        variant = str(model_variant)
        logging.warning(
            "Could not load tokenizer (no path provided). "
            f"Loading from library for model variant: "
            f"{variant}"
        )
        wrapper.hf_tokenizers = BertTokenizerFast.from_pretrained(variant)
        return wrapper
    # A real folder was given: the tokenizer sits in its own sub-folder.
    saved_path = os.path.join(dir_path, cls.name)
    wrapper.hf_tokenizers = BertTokenizerFast.from_pretrained(
        saved_path, **kwargs)
    return wrapper

save

save(dir_path: str) -> None
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bert_tokenizer.py
60
61
62
63
def save(self, dir_path: str) -> None:
    """Save the wrapped tokenizer under the given folder.

    The tokenizer is written to a sub-folder of `dir_path` named after
    the wrapper's `name` attribute.

    Args:
        dir_path (str): The folder to save the tokenizer in.
    """
    # `ensure_tokenizer` is defined outside this snippet; its result is
    # stored back on the instance before being used.
    hf_tok = self.ensure_tokenizer()
    self.hf_tokenizers = hf_tok
    hf_tok.save_pretrained(os.path.join(dir_path, self.name))

token_to_id

token_to_id(token: str) -> Union[int, list[int]]
Source code in medcat-v2/medcat/components/addons/meta_cat/mctokenizers/bert_tokenizer.py
94
95
96
def token_to_id(self, token: str) -> Union[int, list[int]]:
    """Convert a token to its ID using the wrapped tokenizer.

    Args:
        token (str): The token to convert.

    Returns:
        Union[int, list[int]]: Whatever the tokenizer's
            `convert_tokens_to_ids` returns for the token.
    """
    # `ensure_tokenizer` is defined outside this snippet; its result is
    # stored back on the instance before being used.
    hf_tok = self.ensure_tokenizer()
    self.hf_tokenizers = hf_tok
    return hf_tok.convert_tokens_to_ids(token)