Skip to content

medcat.components.addons.relation_extraction.tokenizer

Classes:

Attributes:

logger module-attribute

logger = getLogger(__name__)

BaseTokenizerWrapper

BaseTokenizerWrapper(hf_tokenizers=None, max_seq_length: Optional[int] = None, add_special_tokens: Optional[bool] = False)

Bases: PreTrainedTokenizerFast

Methods:

Attributes:

Source code in medcat-v2/medcat/components/addons/relation_extraction/tokenizer.py
17
18
19
20
21
22
def __init__(self, hf_tokenizers=None,
             max_seq_length: Optional[int] = None,
             add_special_tokens: Optional[bool] = False):
    """Wrap a Hugging Face tokenizer for relation-extraction use.

    Args:
        hf_tokenizers: The underlying Hugging Face tokenizer instance
            to delegate to (None by default).
        max_seq_length: Optional cap on the tokenized sequence length.
        add_special_tokens: Whether special tokens should be added
            during tokenization (stored privately).
    """
    # Plain attribute storage; no work is done at construction time.
    self.max_seq_length = max_seq_length
    self._add_special_tokens = add_special_tokens
    self.hf_tokenizers = hf_tokenizers

hf_tokenizers instance-attribute

hf_tokenizers = hf_tokenizers

max_seq_length instance-attribute

max_seq_length = max_seq_length

name class-attribute instance-attribute

name = 'base_tokenizer_wrapper_rel'

get_pad_id

get_pad_id()
Source code in medcat-v2/medcat/components/addons/relation_extraction/tokenizer.py
30
31
def get_pad_id(self):
    """Return the padding token id of the wrapped HF tokenizer."""
    wrapped = self.hf_tokenizers
    return wrapped.pad_token_id

get_size

get_size()
Source code in medcat-v2/medcat/components/addons/relation_extraction/tokenizer.py
24
25
def get_size(self):
    """Return the vocabulary size of the wrapped HF tokenizer."""
    vocab = self.hf_tokenizers.vocab
    return len(vocab)

load classmethod

load(tokenizer_path: str, relcat_config: ConfigRelCAT, **kwargs) -> BaseTokenizerWrapper
Source code in medcat-v2/medcat/components/addons/relation_extraction/tokenizer.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
@classmethod
def load(cls, tokenizer_path: str, relcat_config: ConfigRelCAT, **kwargs
         ) -> "BaseTokenizerWrapper":
    """Load the tokenizer wrapper matching the configured model family.

    Dispatches on ``relcat_config.general.tokenizer_name`` when a saved
    tokenizer exists at ``tokenizer_path``; otherwise falls back to a
    BERT tokenizer built from ``model_name`` (or ``bert-base-uncased``
    when no model name is configured either).

    Args:
        tokenizer_path: Directory a tokenizer was previously saved to.
        relcat_config: RelCAT configuration; ``general.tokenizer_name``
            and ``general.model_name`` drive the dispatch.
        **kwargs: Forwarded to the concrete wrapper's ``load``.

    Returns:
        A concrete ``BaseTokenizerWrapper`` subclass instance, or a bare
        ``BaseTokenizerWrapper`` if the path exists but no known
        tokenizer name matches.
    """
    tokenizer = BaseTokenizerWrapper()

    cnf_gen = relcat_config.general

    if os.path.exists(tokenizer_path):
        # Imports are deferred so only the selected backend is loaded.
        # NOTE: "modern-bert" must be checked before "bert" — the
        # latter substring matches both names.
        if "modern-bert" in cnf_gen.tokenizer_name:
            from medcat.components.addons.relation_extraction.modernbert.tokenizer import TokenizerWrapperModernBERT  # noqa
            tokenizer = TokenizerWrapperModernBERT.load(
                tokenizer_path, relcat_config=relcat_config, **kwargs)
        elif "bert" in cnf_gen.tokenizer_name:
            from medcat.components.addons.relation_extraction.bert.tokenizer import TokenizerWrapperBERT  # noqa
            tokenizer = TokenizerWrapperBERT.load(
                tokenizer_path, relcat_config=relcat_config, **kwargs)
        elif "llama" in cnf_gen.tokenizer_name:
            from medcat.components.addons.relation_extraction.llama.tokenizer import TokenizerWrapperLlama  # noqa
            tokenizer = TokenizerWrapperLlama.load(
                tokenizer_path, relcat_config=relcat_config, **kwargs)
        else:
            # No known tokenizer name matched: the bare wrapper created
            # above is returned so callers still get a usable object.
            logger.warning(
                "Unrecognised tokenizer name %r; returning a bare "
                "BaseTokenizerWrapper", cnf_gen.tokenizer_name)
        logger.info("Tokenizer loaded %s from: %s",
                    str(tokenizer.__class__.__name__), tokenizer_path)
    elif cnf_gen.model_name:
        logger.info("Attempted to load Tokenizer from path: %s,"
                    " but it doesn't exist, loading default tokenizer from "
                    "model_name relcat_config.general.model_name: %s",
                    tokenizer_path, cnf_gen.model_name)
        from medcat.components.addons.relation_extraction.bert.tokenizer import TokenizerWrapperBERT  # noqa
        from medcat.components.addons.relation_extraction.ml_utils import create_tokenizer_pretrain  # noqa
        logger.info(
            "Adding special tokens to tokenizer: %s %s",
            str(cnf_gen.tokenizer_relation_annotation_special_tokens_tags),
            str(cnf_gen.tokenizer_other_special_tokens))
        tokenizer = TokenizerWrapperBERT(
            BertTokenizerFast.from_pretrained(cnf_gen.model_name),
            add_special_tokens=True)
        tokenizer = create_tokenizer_pretrain(
            tokenizer, relcat_config=relcat_config)
    else:
        logger.info(
            "Attempted to load Tokenizer from path: %s, "
            "but it doesn't exist, loading default tokenizer from "
            "model_name config.general.model_name:bert-base-uncased",
            tokenizer_path)
        from medcat.components.addons.relation_extraction.bert.tokenizer import TokenizerWrapperBERT  # noqa
        # Bug fix: this branch is only reached when cnf_gen.model_name is
        # falsy, so the promised "bert-base-uncased" fallback must be
        # passed explicitly instead of the empty model_name.
        tokenizer = TokenizerWrapperBERT(
            BertTokenizerFast.from_pretrained("bert-base-uncased"),
            max_seq_length=cnf_gen.max_seq_length,
            add_special_tokens=cnf_gen.tokenizer_special_tokens)
    return tokenizer

save

save(dir_path: str)
Source code in medcat-v2/medcat/components/addons/relation_extraction/tokenizer.py
73
74
75
def save(self, dir_path: str):
    """Persist the wrapped HF tokenizer under ``dir_path/<self.name>``.

    Args:
        dir_path: Base directory; the tokenizer files are written to a
            subdirectory named after this wrapper's ``name``.
    """
    target = os.path.join(dir_path, self.name)
    self.hf_tokenizers.save_pretrained(target)

token_to_id

token_to_id(token)
Source code in medcat-v2/medcat/components/addons/relation_extraction/tokenizer.py
27
28
def token_to_id(self, token):
    """Convert a token (or sequence of tokens) to id(s).

    Delegates to the wrapped HF tokenizer's ``convert_tokens_to_ids``
    (which accepts a single token or a list, per the HF API).
    """
    converter = self.hf_tokenizers.convert_tokens_to_ids
    return converter(token)