medcat.components.ner.dict_based_ner

Classes:

NER –

Attributes:

logger –

logger `module-attribute`

logger = getLogger(__name__)

NER

NER(tokenizer: BaseTokenizer, cdb: CDB)

Bases: AbstractEntityProvidingComponent

Methods:

create_new_component –
get_type –
predict_entities –

Detect candidates for concepts - the linker will then be able to do the rest.

Attributes:

automaton –
cdb –
config –
name –
tokenizer –

Source code in medcat-v2/medcat/components/ner/dict_based_ner.py

def __init__(self, tokenizer: BaseTokenizer,
             cdb: CDB) -> None:
    """Set up the dictionary-based NER component.

    Args:
        tokenizer (BaseTokenizer):
            Stored on the instance as `tokenizer`.
        cdb (CDB):
            Stored on the instance as `cdb`; its `config` attribute
            is also kept as `config`.
    """
    super().__init__()
    self.tokenizer, self.cdb = tokenizer, cdb
    self.config = self.cdb.config
    # Start from an empty automaton; the rebuild runs last so that
    # every attribute assigned above is already in place.
    self.automaton = Automaton()
    self._rebuild_automaton()

automaton `instance-attribute`

automaton = Automaton()

cdb `instance-attribute`

cdb = cdb

config `instance-attribute`

config = config

name `class-attribute` `instance-attribute`

name = 'cat_dict_ner'

tokenizer `instance-attribute`

tokenizer = tokenizer

create_new_component `classmethod`

create_new_component(cnf: ComponentConfig, tokenizer: BaseTokenizer, cdb: CDB, vocab: Vocab, model_load_path: Optional[str]) -> NER

Source code in medcat-v2/medcat/components/ner/dict_based_ner.py

@classmethod
def create_new_component(
        cls,
        cnf: ComponentConfig,
        tokenizer: BaseTokenizer,
        cdb: CDB,
        vocab: Vocab,
        model_load_path: Optional[str],
) -> 'NER':
    """Create a new instance of this component.

    Only `tokenizer` and `cdb` are passed on to the constructor;
    `cnf`, `vocab` and `model_load_path` are accepted but not used here.

    Args:
        cnf (ComponentConfig): Unused by this component.
        tokenizer (BaseTokenizer): Passed to the constructor.
        cdb (CDB): Passed to the constructor.
        vocab (Vocab): Unused by this component.
        model_load_path (Optional[str]): Unused by this component.

    Returns:
        NER: The newly constructed component.
    """
    component = cls(tokenizer, cdb)
    return component

get_type

get_type() -> CoreComponentType

Source code in medcat-v2/medcat/components/ner/dict_based_ner.py

def get_type(self) -> CoreComponentType:
    """Return the core component type of this component.

    Returns:
        CoreComponentType: Always `CoreComponentType.ner`.
    """
    return CoreComponentType.ner

predict_entities

predict_entities(doc: MutableDocument, ents: list[MutableEntity] | None = None) -> list[MutableEntity]

Detect candidates for concepts - the linker will then be able to do the rest. It adds entities to doc.entities, and each entity can have entity.link_candidates, which the linker will resolve.

Parameters:

doc
(MutableDocument) –

Spacy document to be annotated with named entities.
ents
(list[MutableEntity] | None, default: None ) –

The entities given. This should be None.

Returns:

list[MutableEntity] –

list[MutableEntity]: The NER'ed entities.

Source code in medcat-v2/medcat/components/ner/dict_based_ner.py

def predict_entities(self, doc: MutableDocument,
                     ents: list[MutableEntity] | None = None
                     ) -> list[MutableEntity]:
    """Detect candidates for concepts - the linker will then be able
    to do the rest.

    The lowercased document text is scanned with the automaton. Each
    match that lines up with at least one token is handed to
    `maybe_annotate_name`, and every entity it returns is collected.

    Args:
        doc (MutableDocument):
            Document to be annotated with named entities.
        ents (list[MutableEntity] | None):
            The entities given. This should be None.

    Raises:
        ValueError: If `ents` is not None.

    Returns:
        list[MutableEntity]:
            The NER'ed entities.
    """
    if ents is not None:
        # NER does not consume upstream entities; fail loudly instead of
        # silently ignoring them.
        raise ValueError(f"Unexpected entities sent to NER: {ents}")
    if self.cdb.has_changed_names:
        # The names in the CDB changed, so the automaton is stale.
        self.cdb._reset_subnames()
        self._rebuild_automaton()
    # Matching is done against the lowercased text.
    text = doc.base.text.lower()
    ner_ents: list[MutableEntity] = []
    for end_idx, raw_name in self.automaton.iter(text):
        # NOTE(review): presumably `end_idx` is the inclusive index of
        #       the last matched character - confirm against the
        #       automaton implementation.
        start_idx = end_idx - len(raw_name) + 1
        cur_tokens = doc.get_tokens(start_idx, end_idx)
        if not isinstance(cur_tokens, list):
            # NOTE: this shouldn't really happen since
            #       there should be no entities defined
            #       before the NER step.
            #       But we will (at least for now) still handle this
            cur_tokens = list(cur_tokens)
        if not cur_tokens:
            # NOTE: the most likely reason for this is when matching
            #       a substring (e.g. an abbreviation in a longer word).
            #       In that case, no tokens will match. But we
            #       don't really want to catch `mi` (for myocardial
            #       infarction) in "family".
            continue
        # Swap spaces for the configured separator before annotating.
        preprocessed_name = raw_name.replace(
            ' ', self.config.general.separator)
        # The current number of found entities is passed as the last
        # argument.
        ent = maybe_annotate_name(
            self.tokenizer, preprocessed_name, cur_tokens,
            doc, self.cdb, self.config, len(ner_ents))
        if ent:
            ner_ents.append(ent)
    return ner_ents

medcat.components.ner.dict_based_ner

logger `module-attribute`

NER

automaton `instance-attribute`

cdb `instance-attribute`

config `instance-attribute`

name `class-attribute` `instance-attribute`

tokenizer `instance-attribute`

create_new_component `classmethod`

get_type

predict_entities

`doc`

`ents`

medcat.components.ner.dict_based_ner

logger module-attribute

NER

automaton instance-attribute

cdb instance-attribute

config instance-attribute

name class-attribute instance-attribute

tokenizer instance-attribute

create_new_component classmethod

get_type

predict_entities

doc

ents

logger `module-attribute`

automaton `instance-attribute`

cdb `instance-attribute`

config `instance-attribute`

name `class-attribute` `instance-attribute`

tokenizer `instance-attribute`

create_new_component `classmethod`

`doc`

`ents`