medcat.components.ner.vocab_based_annotator

I would just ignore this whole class, it's just a lot of rules that work nicely for CDB once the software is trained the main thing are the context vectors.

Functions:

annotate_name –
maybe_annotate_name –

Given a name it will check should it be annotated based on config rules.

Attributes:

logger –

logger `module-attribute`

logger = getLogger(__name__)

annotate_name

annotate_name(tokenizer: BaseTokenizer, name: str, tkns: list[MutableToken], doc: MutableDocument, cdb: CDB, cur_id: int | None, label: str)

Source code in medcat-v2/medcat/components/ner/vocab_based_annotator.py

def annotate_name(tokenizer: BaseTokenizer, name: str,
                  tkns: list[MutableToken],
                  doc: MutableDocument, cdb: CDB,
                  cur_id: int | None,
                  label: str):
    entity: MutableEntity = tokenizer.create_entity(
        doc, tkns[0].base.index, tkns[-1].base.index + 1, label=label)
    # Only set this property when using a vocab approach
    # and where this name fits a name in the cdb.
    # All standard name entity recognition models will not set this.
    entity.detected_name = name
    entity.link_candidates = list(cdb.name2info[name]['per_cui_status'])

    if cur_id is None:
        logger.warning(
            "`medcat.components.ner.vocab_based_annotator.annotate_name` "
            "was called with no `cur_id`. This behaviour is not fully "
            "supported anymore.")
        start_index = entity.base.start_char_index
        span_len = len(name)
        cur_id = start_index * _START_INDEX_MULT + span_len
        # NOTE: These will be unique if the maximum length of each
        #       entity does not exceed _START_INDEX_MULT (1000)
        logger.warning(
            "Using the text start index %d (multiplied by %d) and adding "
            "the span length %d to get the id of %d", start_index,
            _START_INDEX_MULT, span_len, cur_id)
        logger.warning(
            "Setting MutableDocument.ner_ents during the method "
            "`medcat.components.ner.vocab_based_annotator.annotate_name` "
            "because the old API (without an ID) was used")
        doc.ner_ents.append(entity)  # TODO: remove this

    entity.id = cur_id
    entity.confidence = -1  # This does not calculate confidence

    # Not necessary, but why not
    logger.debug("NER detected an entity.\n\tDetected name: %s" +
                 "\n\tLink candidates: %s\n", entity.detected_name,
                 entity.link_candidates)
    return entity

maybe_annotate_name

maybe_annotate_name(tokenizer: BaseTokenizer, name: str, tkns: list[MutableToken], doc: MutableDocument, cdb: CDB, config: Config, cur_id: int | None = None, label: str = 'concept') -> Optional[MutableEntity]

Given a name it will check should it be annotated based on config rules. If yes the annotation will be added to the doc.entities array.

Parameters:

tokenizer
(BaseTokenizer) –

The tokenizer (probably SpaCy).
name
(str) –

The name found in the text of the document.
tkns
(list[MutableToken]) –

Tokens that belong to this name in the spacy document.
doc
(BaseDocument) –

Spacy document to be annotated with named entities.
cdb
(CDB) –

Concept database.
config
(Config) –

Global config for medcat.
cur_id
(int | None, default: None ) –

The potential ID for the entity. Defaults to None.
label
(str, default: 'concept' ) –

Label for this name (usually concept if we are using a vocab based approach).

Returns:

Optional[MutableEntity] –

Optional[BaseEntity]: The entity, if relevant.

Source code in medcat-v2/medcat/components/ner/vocab_based_annotator.py

def maybe_annotate_name(tokenizer: BaseTokenizer, name: str,
                        tkns: list[MutableToken],
                        doc: MutableDocument, cdb: CDB, config: Config,
                        cur_id: int | None = None,
                        label: str = 'concept'
                        ) -> Optional[MutableEntity]:
    """Given a name it will check should it be annotated based on config rules.
    If yes the annotation will be added to the doc.entities array.

    Args:
        tokenizer (BaseTokenizer):
            The tokenizer (probably SpaCy).
        name (str):
            The name found in the text of the document.
        tkns (list[MutableToken]):
            Tokens that belong to this name in the spacy document.
        doc (BaseDocument):
            Spacy document to be annotated with named entities.
        cdb (CDB):
            Concept database.
        config (Config):
            Global config for medcat.
        cur_id (int | None):
            The potential ID for the entity. Defaults to None.
        label (str):
            Label for this name (usually `concept` if we are using
            a vocab based approach).

    Returns:
        Optional[BaseEntity]: The entity, if relevant.
    """

    logger.debug("Maybe annotating name: %s", name)

    # Check uppercase to distinguish uppercase and lowercase
    # words that have a different meaning.
    if config.components.ner.check_upper_case_names:
        # Check whether name is completely uppercase in CDB.
        is_upper = (cdb.name2info[name]['is_upper']
                    if name in cdb.name2info else False)
        if is_upper:
            # Check whether tokens are also in uppercase. If tokens
            # are not in uppercase, there is a mismatch.
            if not all([x.base.is_upper for x in tkns]):
                return None

    if len(name) >= config.components.ner.min_name_len:
        # Check the upper case limit, last part checks if it is
        # one token and uppercase
        if (len(name) >= config.components.ner.upper_case_limit_len or
                (len(tkns) == 1 and tkns[0].base.is_upper)):
            # Everything is fine, mark name
            return annotate_name(
                tokenizer, name, tkns, doc, cdb, cur_id, label)

    return None

medcat.components.ner.vocab_based_annotator

logger `module-attribute`

annotate_name

maybe_annotate_name

`tokenizer`

`name`

`tkns`

`doc`

`cdb`

`config`

`cur_id`

`label`

medcat.components.ner.vocab_based_annotator

logger module-attribute

annotate_name

maybe_annotate_name

tokenizer

name

tkns

doc

cdb

config

cur_id

label

logger `module-attribute`

`tokenizer`

`name`

`tkns`

`doc`

`cdb`

`config`

`cur_id`

`label`