Skip to content

medcat.components.ner.vocab_based_ner

Classes:

Attributes:

logger module-attribute

logger = getLogger(__name__)

NER

NER(tokenizer: BaseTokenizer, cdb: CDB)

Bases: AbstractEntityProvidingComponent

Methods:

Attributes:

Source code in medcat-v2/medcat/components/ner/vocab_based_ner.py
20
21
22
23
24
25
def __init__(self, tokenizer: BaseTokenizer,
             cdb: CDB) -> None:
    """Create a vocab-based NER component.

    Args:
        tokenizer (BaseTokenizer): Tokenizer used when annotating names.
        cdb (CDB): Concept database; its config is also kept on `self`.
    """
    super().__init__()
    # Keep references to the collaborators; the effective config is
    # whatever the CDB carries.
    self.tokenizer = tokenizer
    self.cdb = cdb
    self.config = cdb.config

cdb instance-attribute

cdb = cdb

config instance-attribute

config = config

name class-attribute instance-attribute

name = 'cat_ner'

tokenizer instance-attribute

tokenizer = tokenizer

create_new_component classmethod

create_new_component(cnf: ComponentConfig, tokenizer: BaseTokenizer, cdb: CDB, vocab: Vocab, model_load_path: Optional[str]) -> NER
Source code in medcat-v2/medcat/components/ner/vocab_based_ner.py
126
127
128
129
130
@classmethod
def create_new_component(
        cls, cnf: ComponentConfig, tokenizer: BaseTokenizer,
        cdb: CDB, vocab: Vocab, model_load_path: Optional[str]) -> 'NER':
    return cls(tokenizer, cdb)

get_type

get_type() -> CoreComponentType
Source code in medcat-v2/medcat/components/ner/vocab_based_ner.py
27
28
def get_type(self) -> CoreComponentType:
    """Report which core-component slot this class fills (NER)."""
    return CoreComponentType.ner

predict_entities

predict_entities(doc: MutableDocument, ents: list[MutableEntity] | None = None) -> list[MutableEntity]

Detect candidates for concepts - linker will then be able to do the rest. It adds entities to the doc.entities and each entity can have the entity.link_candidates - that the linker will resolve.

Parameters:

  • doc

    (MutableDocument) –

    Spacy document to be annotated with named entities.

  • ents

    (list[MutableEntity] | None, default: None ) –

    The entities given. This should be None; this implementation never reads it, and the parameter exists only for interface compatibility.

Returns:

Source code in medcat-v2/medcat/components/ner/vocab_based_ner.py
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def predict_entities(self, doc: MutableDocument,
                     ents: list[MutableEntity] | None = None
                     ) -> list[MutableEntity]:
    """Detect candidates for concepts - linker will then be able
    to do the rest. It adds `entities` to the doc.entities and each
    entity can have the entity.link_candidates - that the linker
    will resolve.

    Args:
        doc (MutableDocument):
            Spacy document to be annotated with named entities.
        ents (list[MutableEntity] | None):
            The entities given. This should be None (this
            implementation never reads it; the parameter exists for
            interface compatibility).

    Returns:
        list[MutableEntity]:
            The NER'ed entities.
    """
    # Upper bound on how many filtered-out tokens may lie between two
    # kept tokens while still growing one multi-token candidate name.
    max_skip_tokens = self.config.components.ner.max_skip_tokens
    # Separator used when joining token texts into a CDB name key.
    _sep = self.config.general.separator
    # Just take the tokens we need
    _doc = [tkn for tkn in doc if not tkn.to_skip]
    ner_ents: list[MutableEntity] = []
    for i, tkn in enumerate(_doc):
        # NOTE(review): redundant — `enumerate` already yields
        # `_doc[i]` as `tkn`; left untouched here.
        tkn = _doc[i]
        tkns = [tkn]
        # name_versions = [tkn.lower_, tkn._.norm]
        # name_versions = [tkn.norm, tkn.base.lower]
        # Alternative textual forms of the token to try against the
        # CDB (exact set is defined by the token's `text_versions`).
        name_versions = tkn.base.text_versions
        name = ""

        # Seed `name` with the first version the CDB recognises as a
        # subname (presumably a prefix of some full concept name —
        # confirm against the CDB API).
        for name_version in name_versions:
            if self.cdb.has_subname(name_version):
                # NOTE(review): `name` is always "" at this point
                # (set just above, and we break on the first hit),
                # so the truthy branch appears unreachable.
                if name:
                    name = name + _sep + name_version
                else:
                    name = name_version
                break
        # if name is in CDB
        if name in self.cdb.name2info and not tkn.base.is_stop:
            ent = maybe_annotate_name(
                self.tokenizer, name, tkns, doc,
                self.cdb, self.config, len(ner_ents))
            if ent:
                ner_ents.append(ent)
        # if name is not a subname CDB (explicitly)
        if not name:
            # There has to be at least something appended to the name
            # to go forward
            continue
        # if name is a part of a concept
        # we start adding onto it to get a match
        for j in range(i + 1, len(_doc)):
            # Gap between consecutive kept tokens; `base.index` is
            # presumably the position in the full document — confirm.
            if (_doc[j].base.index - _doc[j - 1].base.index - 1
                    > max_skip_tokens):
                # Do not allow to skip more than limit
                break
            tkn = _doc[j]
            tkns.append(tkn)
            # name_versions = [tkn.norm, tkn.base.lower]
            name_versions = tkn.base.text_versions

            name_changed = False
            name_reverse = None
            for name_version in name_versions:
                # Try extending the running name with this token.
                _name = name + _sep + name_version
                if self.cdb.has_subname(_name):
                    # Append the name and break
                    name = _name
                    name_changed = True
                    break

                if self.config.components.ner.try_reverse_word_order:
                    # Also try the token *before* the current name
                    # (e.g. "failure heart" for "heart failure").
                    _name_reverse = name_version + _sep + name
                    if self.cdb.has_subname(_name_reverse):
                        # Append the name and break
                        name_reverse = _name_reverse

            if name_changed:
                # Forward-order extension matched a known subname;
                # annotate if it is also a full name in the CDB.
                if name in self.cdb.name2info:
                    ent = maybe_annotate_name(
                        self.tokenizer, name, tkns, doc,
                        self.cdb, self.config, len(ner_ents))
                    if ent:
                        ner_ents.append(ent)
            elif name_reverse is not None:
                # Only the reversed word order matched.
                # NOTE(review): `name` itself is not updated here, so
                # later iterations keep extending the forward-order
                # name — confirm this is intended.
                if name_reverse in self.cdb.name2info:
                    ent = maybe_annotate_name(
                        self.tokenizer, name_reverse, tkns,
                        doc, self.cdb, self.config, len(ner_ents))
                    if ent:
                        ner_ents.append(ent)
            else:
                # Neither direction extends to a known subname — stop
                # growing this candidate.
                break
    return ner_ents