Skip to content

medcat.components.ner.dict_based_ner

Classes:

Attributes:

logger module-attribute

logger = getLogger(__name__)

NER

NER(tokenizer: BaseTokenizer, cdb: CDB)

Bases: AbstractEntityProvidingComponent

Methods:

Attributes:

Source code in medcat-v2/medcat/components/ner/dict_based_ner.py
31
32
33
34
35
36
37
38
def __init__(
    self,
    tokenizer: BaseTokenizer,
    cdb: CDB,
) -> None:
    """Set up the dictionary-based NER component.

    Stores the tokenizer and the CDB, takes the config from the CDB,
    creates an empty automaton and then populates it through
    `_rebuild_automaton`.

    Args:
        tokenizer (BaseTokenizer): The tokenizer to keep on the instance.
        cdb (CDB): The concept database; its `config` is used as the
            component's config.
    """
    super().__init__()
    self.tokenizer = tokenizer
    self.cdb = cdb
    # The component shares the config object owned by the CDB.
    self.config = cdb.config
    # The automaton has to exist before it can be (re)built.
    self.automaton = Automaton()
    self._rebuild_automaton()

automaton instance-attribute

automaton = Automaton()

cdb instance-attribute

cdb = cdb

config instance-attribute

config = config

name class-attribute instance-attribute

name = 'cat_dict_ner'

tokenizer instance-attribute

tokenizer = tokenizer

create_new_component classmethod

create_new_component(cnf: ComponentConfig, tokenizer: BaseTokenizer, cdb: CDB, vocab: Vocab, model_load_path: Optional[str]) -> NER
Source code in medcat-v2/medcat/components/ner/dict_based_ner.py
115
116
117
118
119
@classmethod
def create_new_component(
    cls,
    cnf: ComponentConfig,
    tokenizer: BaseTokenizer,
    cdb: CDB,
    vocab: Vocab,
    model_load_path: Optional[str],
) -> 'NER':
    """Create a new instance of this component.

    Only `tokenizer` and `cdb` are used for construction; `cnf`,
    `vocab` and `model_load_path` are accepted but not used here.

    Args:
        cnf (ComponentConfig): The component config (unused).
        tokenizer (BaseTokenizer): The tokenizer passed to the constructor.
        cdb (CDB): The concept database passed to the constructor.
        vocab (Vocab): The vocabulary (unused).
        model_load_path (Optional[str]): The model load path (unused).

    Returns:
        NER: The newly created component.
    """
    component = cls(tokenizer, cdb)
    return component

get_type

get_type() -> CoreComponentType
Source code in medcat-v2/medcat/components/ner/dict_based_ner.py
62
63
def get_type(self) -> CoreComponentType:
    """Return the core component type of this component.

    Returns:
        CoreComponentType: Always `CoreComponentType.ner`.
    """
    component_type = CoreComponentType.ner
    return component_type

predict_entities

predict_entities(doc: MutableDocument, ents: list[MutableEntity] | None = None) -> list[MutableEntity]

Detect candidates for concepts; the linker will then be able to do the rest. This adds entities to doc.entities, and each entity can carry entity.link_candidates, which the linker will resolve.

Parameters:

  • doc

    (MutableDocument) –

    spaCy document to be annotated with named entities.

  • ents

    (list[MutableEntity] | None, default: None ) –

    The entities given. This should be None.

Returns:

  • (list[MutableEntity]) –

    The NER'ed entities.

Source code in medcat-v2/medcat/components/ner/dict_based_ner.py
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def predict_entities(self, doc: MutableDocument,
                     ents: list[MutableEntity] | None = None
                     ) -> list[MutableEntity]:
    """Detect candidates for concepts - linker will then be able
    to do the rest. It adds `entities` to the doc.entities and each
    entity can have the entity.link_candidates - that the linker
    will resolve.

    The lower-cased document text is scanned with the automaton. Each
    match is mapped back to the document's tokens and passed to
    `maybe_annotate_name`; every entity it returns is collected.

    Args:
        doc (MutableDocument):
            The document to be annotated with named entities.
        ents (list[MutableEntity] | None):
            The entities given. This should be None.

    Raises:
        ValueError: If `ents` is not None.

    Returns:
        list[MutableEntity]:
            The NER'ed entities.
    """
    if ents is not None:
        # NER is expected to be the step that creates the entities,
        # so being handed any is treated as an error.
        raise ValueError(f"Unexpected entities sent to NER: {ents}")
    if self.cdb.has_changed_names:
        # The names in the CDB have changed, so the automaton built
        # from them needs to be refreshed before matching.
        self.cdb._reset_subnames()
        self._rebuild_automaton()
    # NOTE(review): match offsets are computed on the lower-cased text
    #               but used to look up tokens in the document; this
    #               assumes lower-casing does not change the text
    #               length - TODO confirm for non-ASCII input.
    text = doc.base.text.lower()
    ner_ents: list[MutableEntity] = []
    for end_idx, raw_name in self.automaton.iter(text):
        # The automaton reports the index of the last matched
        # character, so the start is derived from the name length.
        start_idx = end_idx - len(raw_name) + 1
        cur_tokens = doc.get_tokens(start_idx, end_idx)
        if not isinstance(cur_tokens, list):
            # NOTE: this shouldn't really happen since
            #       there should be no entities defined
            #       before the NER step.
            #       But we will (at least for now) still handle this
            cur_tokens = list(cur_tokens)
        if not cur_tokens:
            # NOTE: the most likely reason for this is when matching
            #       a substring (e.g an abbreviation in a longer word).
            #       In that case, no spacy tokens will match. But we
            #       don't really want to catch `mi` (for myocardial
            #       infarction) in "family".
            continue
        # Spaces in the matched name are swapped for the configured
        # separator before the name is handed to the annotation step.
        preprocessed_name = raw_name.replace(
            ' ', self.config.general.separator)
        ent = maybe_annotate_name(
            self.tokenizer, preprocessed_name, cur_tokens,
            doc, self.cdb, self.config, len(ner_ents))
        if ent:
            ner_ents.append(ent)
    return ner_ents