Skip to content

medcat.components.addons.relation_extraction.tokenizer

Classes:

Attributes:

logger module-attribute

logger = getLogger(__name__)

BaseTokenizerWrapper

BaseTokenizerWrapper(hf_tokenizers=None, max_seq_length: Optional[int] = None, add_special_tokens: Optional[bool] = False)

Bases: PreTrainedTokenizerFast

Methods:

Attributes:

Source code in medcat-v2/medcat/components/addons/relation_extraction/tokenizer.py
17
18
19
20
21
22
def __init__(self, hf_tokenizers=None,
             max_seq_length: Optional[int] = None,
             add_special_tokens: Optional[bool] = False):
    """Wrap a Hugging Face tokenizer for relation-extraction use.

    Args:
        hf_tokenizers: The underlying Hugging Face tokenizer instance
            to delegate to (None by default).
        max_seq_length: Optional cap on the tokenized sequence length.
        add_special_tokens: Whether special tokens should be added
            during tokenization (stored privately).
    """
    # Plain attribute storage; no work is done at construction time.
    self.max_seq_length = max_seq_length
    self._add_special_tokens = add_special_tokens
    self.hf_tokenizers = hf_tokenizers

hf_tokenizers instance-attribute

hf_tokenizers = hf_tokenizers

max_seq_length instance-attribute

max_seq_length = max_seq_length

name class-attribute instance-attribute

name = 'base_tokenizer_wrapper_rel'

get_pad_id

get_pad_id()
Source code in medcat-v2/medcat/components/addons/relation_extraction/tokenizer.py
30
31
def get_pad_id(self):
    """Return the padding token id of the wrapped HF tokenizer."""
    wrapped = self.hf_tokenizers
    return wrapped.pad_token_id

get_size

get_size()
Source code in medcat-v2/medcat/components/addons/relation_extraction/tokenizer.py
24
25
def get_size(self):
    """Return the vocabulary size of the wrapped HF tokenizer."""
    vocab = self.hf_tokenizers.vocab
    return len(vocab)

load classmethod

load(tokenizer_path: str, relcat_config: ConfigRelCAT, **kwargs) -> BaseTokenizerWrapper
Source code in medcat-v2/medcat/components/addons/relation_extraction/tokenizer.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
@classmethod
def load(cls, tokenizer_path: str, relcat_config: ConfigRelCAT, **kwargs
         ) -> "BaseTokenizerWrapper":
    """Load the tokenizer wrapper matching the configured model family.

    Dispatches on ``relcat_config.general.tokenizer_name`` when a saved
    tokenizer exists at ``tokenizer_path``; otherwise falls back to a
    BERT tokenizer built from ``model_name`` (or ``bert-base-uncased``
    when no model name is configured either).

    Args:
        tokenizer_path: Directory a tokenizer was previously saved to.
        relcat_config: RelCAT configuration; ``general.tokenizer_name``
            and ``general.model_name`` drive the dispatch.
        **kwargs: Forwarded to the concrete wrapper's ``load``.

    Returns:
        A concrete ``BaseTokenizerWrapper`` subclass instance, or a bare
        ``BaseTokenizerWrapper`` if the path exists but no known
        tokenizer name matches.
    """
    tokenizer = BaseTokenizerWrapper()

    cnf_gen = relcat_config.general

    if os.path.exists(tokenizer_path):
        # Imports are deferred so only the selected backend is loaded.
        # NOTE: "modern-bert" must be checked before "bert" — the
        # latter substring matches both names.
        if "modern-bert" in cnf_gen.tokenizer_name:
            from medcat.components.addons.relation_extraction.modernbert.tokenizer import TokenizerWrapperModernBERT  # noqa
            tokenizer = TokenizerWrapperModernBERT.load(
                tokenizer_path, relcat_config=relcat_config, **kwargs)
        elif "bert" in cnf_gen.tokenizer_name:
            from medcat.components.addons.relation_extraction.bert.tokenizer import TokenizerWrapperBERT  # noqa
            tokenizer = TokenizerWrapperBERT.load(
                tokenizer_path, relcat_config=relcat_config, **kwargs)
        elif "llama" in cnf_gen.tokenizer_name:
            from medcat.components.addons.relation_extraction.llama.tokenizer import TokenizerWrapperLlama  # noqa
            tokenizer = TokenizerWrapperLlama.load(
                tokenizer_path, relcat_config=relcat_config, **kwargs)
        else:
            # No known tokenizer name matched: the bare wrapper created
            # above is returned so callers still get a usable object.
            logger.warning(
                "Unrecognised tokenizer name %r; returning a bare "
                "BaseTokenizerWrapper", cnf_gen.tokenizer_name)
        logger.info("Tokenizer loaded %s from: %s",
                    str(tokenizer.__class__.__name__), tokenizer_path)
    elif cnf_gen.model_name:
        logger.info("Attempted to load Tokenizer from path: %s,"
                    " but it doesn't exist, loading default tokenizer from "
                    "model_name relcat_config.general.model_name: %s",
                    tokenizer_path, cnf_gen.model_name)
        from medcat.components.addons.relation_extraction.bert.tokenizer import TokenizerWrapperBERT  # noqa
        from medcat.components.addons.relation_extraction.ml_utils import create_tokenizer_pretrain  # noqa
        logger.info(
            "Adding special tokens to tokenizer: %s %s",
            str(cnf_gen.tokenizer_relation_annotation_special_tokens_tags),
            str(cnf_gen.tokenizer_other_special_tokens))
        tokenizer = TokenizerWrapperBERT(
            BertTokenizerFast.from_pretrained(cnf_gen.model_name),
            add_special_tokens=True)
        tokenizer = create_tokenizer_pretrain(
            tokenizer, relcat_config=relcat_config)
    else:
        logger.info(
            "Attempted to load Tokenizer from path: %s, "
            "but it doesn't exist, loading default tokenizer from "
            "model_name config.general.model_name:bert-base-uncased",
            tokenizer_path)
        from medcat.components.addons.relation_extraction.bert.tokenizer import TokenizerWrapperBERT  # noqa
        # Bug fix: this branch is only reached when cnf_gen.model_name is
        # falsy, so the promised "bert-base-uncased" fallback must be
        # passed explicitly instead of the empty model_name.
        tokenizer = TokenizerWrapperBERT(
            BertTokenizerFast.from_pretrained("bert-base-uncased"),
            max_seq_length=cnf_gen.max_seq_length,
            add_special_tokens=cnf_gen.tokenizer_special_tokens)
    return tokenizer

save

save(dir_path: str)
Source code in medcat-v2/medcat/components/addons/relation_extraction/tokenizer.py
73
74
75
def save(self, dir_path: str):
    """Persist the wrapped HF tokenizer under ``dir_path/<self.name>``.

    Args:
        dir_path: Base directory; the tokenizer files are written to a
            subdirectory named after this wrapper's ``name``.
    """
    target = os.path.join(dir_path, self.name)
    self.hf_tokenizers.save_pretrained(target)

token_to_id

token_to_id(token)
Source code in medcat-v2/medcat/components/addons/relation_extraction/tokenizer.py
27
28
def token_to_id(self, token):
    """Convert a token (or sequence of tokens) to id(s).

    Delegates to the wrapped HF tokenizer's ``convert_tokens_to_ids``
    (which accepts a single token or a list, per the HF API).
    """
    converter = self.hf_tokenizers.convert_tokens_to_ids
    return converter(token)