@classmethod
def load(cls, tokenizer_path: str, relcat_config: ConfigRelCAT, **kwargs
) -> "BaseTokenizerWrapper":
tokenizer = BaseTokenizerWrapper()
cnf_gen = relcat_config.general
if os.path.exists(tokenizer_path):
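            # Dispatch on the configured tokenizer name to pick the matching
            # wrapper. "modern-bert" is checked before "bert" because the
            # latter is a substring of the former.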
if "modern-bert" in cnf_gen.tokenizer_name:
from medcat.components.addons.relation_extraction.modernbert.tokenizer import TokenizerWrapperModernBERT # noqa
tokenizer = TokenizerWrapperModernBERT.load(
tokenizer_path, relcat_config=relcat_config, **kwargs)
elif "bert" in cnf_gen.tokenizer_name:
from medcat.components.addons.relation_extraction.bert.tokenizer import TokenizerWrapperBERT # noqa
tokenizer = TokenizerWrapperBERT.load(
tokenizer_path, relcat_config=relcat_config, **kwargs)
elif "llama" in cnf_gen.tokenizer_name:
from medcat.components.addons.relation_extraction.llama.tokenizer import TokenizerWrapperLlama # noqa
tokenizer = TokenizerWrapperLlama.load(
tokenizer_path, relcat_config=relcat_config, **kwargs)
logger.info("Tokenizer loaded %s from: %s",
str(tokenizer.__class__.__name__), tokenizer_path)
elif cnf_gen.model_name:
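            # No saved tokenizer on disk: build a fresh BERT tokenizer from
            # the configured model name and add the relation special tokens.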
logger.info("Attempted to load Tokenizer from path: %s,"
" but it doesn't exist, loading default toknizer from "
"model_name relcat_config.general.model_name: %s",
tokenizer_path, cnf_gen.model_name)
from medcat.components.addons.relation_extraction.bert.tokenizer import TokenizerWrapperBERT # noqa
from medcat.components.addons.relation_extraction.ml_utils import create_tokenizer_pretrain # noqa
            logger.info(
                "Adding special tokens to tokenizer: %s %s",
                str(cnf_gen.tokenizer_relation_annotation_special_tokens_tags),
                str(cnf_gen.tokenizer_other_special_tokens))
tokenizer = TokenizerWrapperBERT(
BertTokenizerFast.from_pretrained(cnf_gen.model_name),
add_special_tokens=True)
tokenizer = create_tokenizer_pretrain(
tokenizer, relcat_config=relcat_config)
else:
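            # Neither a saved tokenizer nor a model name is available: fall
            # back to a plain bert-base-uncased tokenizer.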
            logger.info(
                "Attempted to load tokenizer from path: %s,"
                " but it doesn't exist and no model_name is configured;"
                " loading default tokenizer from bert-base-uncased",
                tokenizer_path)
from medcat.components.addons.relation_extraction.bert.tokenizer import TokenizerWrapperBERT # noqa
            # model_name is empty in this branch, so use the default
            # checkpoint named in the log message above.
            tokenizer = TokenizerWrapperBERT(
                BertTokenizerFast.from_pretrained("bert-base-uncased"),
                max_seq_length=cnf_gen.max_seq_length,
                add_special_tokens=cnf_gen.tokenizer_special_tokens)
return tokenizer
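
    # Illustrative usage (a sketch; the path below is a placeholder and not a
    # value taken from this module):
    #
    #     relcat_config = ConfigRelCAT()
    #     tokenizer = BaseTokenizerWrapper.load(
    #         "/path/to/saved/tokenizer", relcat_config)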