Skip to content

medcat.components.ner.vocab_based_ner

Classes:

Attributes:

logger module-attribute

logger = getLogger(__name__)

NER

NER(tokenizer: BaseTokenizer, cdb: CDB)

Bases: AbstractEntityProvidingComponent

Methods:

Attributes:

Source code in medcat-v2/medcat/components/ner/vocab_based_ner.py
20
21
22
23
24
25
def __init__(self, tokenizer: BaseTokenizer,
             cdb: CDB) -> None:
    """Create a vocab-based NER component.

    Args:
        tokenizer (BaseTokenizer): Tokenizer used when annotating names.
        cdb (CDB): Concept database; its config is also kept on `self`.
    """
    super().__init__()
    # Keep references to the collaborators; the effective config is
    # whatever the CDB carries.
    self.tokenizer = tokenizer
    self.cdb = cdb
    self.config = cdb.config

cdb instance-attribute

cdb = cdb

config instance-attribute

config = config

name class-attribute instance-attribute

name = 'cat_ner'

tokenizer instance-attribute

tokenizer = tokenizer

create_new_component classmethod

create_new_component(cnf: ComponentConfig, tokenizer: BaseTokenizer, cdb: CDB, vocab: Vocab, model_load_path: Optional[str]) -> NER
Source code in medcat-v2/medcat/components/ner/vocab_based_ner.py
126
127
128
129
130
@classmethod
def create_new_component(
        cls, cnf: ComponentConfig, tokenizer: BaseTokenizer,
        cdb: CDB, vocab: Vocab, model_load_path: Optional[str]) -> 'NER':
    return cls(tokenizer, cdb)

get_type

get_type() -> CoreComponentType
Source code in medcat-v2/medcat/components/ner/vocab_based_ner.py
27
28
def get_type(self) -> CoreComponentType:
    """Report which core-component slot this class fills (NER)."""
    return CoreComponentType.ner

predict_entities

predict_entities(doc: MutableDocument, ents: list[MutableEntity] | None = None) -> list[MutableEntity]

Detect candidates for concepts - linker will then be able to do the rest. It adds entities to the doc.entities and each entity can have the entity.link_candidates - that the linker will resolve.

Parameters:

  • doc

    (MutableDocument) –

    Spacy document to be annotated with named entities.

  • ents

    (list[MutableEntity] | None, default: None ) –

    The entities given. This should be None; this implementation never reads it, and the parameter exists only for interface compatibility.

Returns:

Source code in medcat-v2/medcat/components/ner/vocab_based_ner.py
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def predict_entities(self, doc: MutableDocument,
                     ents: list[MutableEntity] | None = None
                     ) -> list[MutableEntity]:
    """Detect candidates for concepts - linker will then be able
    to do the rest. It adds `entities` to the doc.entities and each
    entity can have the entity.link_candidates - that the linker
    will resolve.

    Args:
        doc (MutableDocument):
            Spacy document to be annotated with named entities.
        ents (list[MutableEntity] | None):
            The entities given. This should be None (this
            implementation never reads it; the parameter exists for
            interface compatibility).

    Returns:
        list[MutableEntity]:
            The NER'ed entities.
    """
    # Upper bound on how many filtered-out tokens may lie between two
    # kept tokens while still growing one multi-token candidate name.
    max_skip_tokens = self.config.components.ner.max_skip_tokens
    # Separator used when joining token texts into a CDB name key.
    _sep = self.config.general.separator
    # Just take the tokens we need
    _doc = [tkn for tkn in doc if not tkn.to_skip]
    ner_ents: list[MutableEntity] = []
    for i, tkn in enumerate(_doc):
        # NOTE(review): redundant — `enumerate` already yields
        # `_doc[i]` as `tkn`; left untouched here.
        tkn = _doc[i]
        tkns = [tkn]
        # name_versions = [tkn.lower_, tkn._.norm]
        # name_versions = [tkn.norm, tkn.base.lower]
        # Alternative textual forms of the token to try against the
        # CDB (exact set is defined by the token's `text_versions`).
        name_versions = tkn.base.text_versions
        name = ""

        # Seed `name` with the first version the CDB recognises as a
        # subname (presumably a prefix of some full concept name —
        # confirm against the CDB API).
        for name_version in name_versions:
            if self.cdb.has_subname(name_version):
                # NOTE(review): `name` is always "" at this point
                # (set just above, and we break on the first hit),
                # so the truthy branch appears unreachable.
                if name:
                    name = name + _sep + name_version
                else:
                    name = name_version
                break
        # if name is in CDB
        if name in self.cdb.name2info and not tkn.base.is_stop:
            ent = maybe_annotate_name(
                self.tokenizer, name, tkns, doc,
                self.cdb, self.config, len(ner_ents))
            if ent:
                ner_ents.append(ent)
        # if name is not a subname CDB (explicitly)
        if not name:
            # There has to be at least something appended to the name
            # to go forward
            continue
        # if name is a part of a concept
        # we start adding onto it to get a match
        for j in range(i + 1, len(_doc)):
            # Gap between consecutive kept tokens; `base.index` is
            # presumably the position in the full document — confirm.
            if (_doc[j].base.index - _doc[j - 1].base.index - 1
                    > max_skip_tokens):
                # Do not allow to skip more than limit
                break
            tkn = _doc[j]
            tkns.append(tkn)
            # name_versions = [tkn.norm, tkn.base.lower]
            name_versions = tkn.base.text_versions

            name_changed = False
            name_reverse = None
            for name_version in name_versions:
                # Try extending the running name with this token.
                _name = name + _sep + name_version
                if self.cdb.has_subname(_name):
                    # Append the name and break
                    name = _name
                    name_changed = True
                    break

                if self.config.components.ner.try_reverse_word_order:
                    # Also try the token *before* the current name
                    # (e.g. "failure heart" for "heart failure").
                    _name_reverse = name_version + _sep + name
                    if self.cdb.has_subname(_name_reverse):
                        # Append the name and break
                        name_reverse = _name_reverse

            if name_changed:
                # Forward-order extension matched a known subname;
                # annotate if it is also a full name in the CDB.
                if name in self.cdb.name2info:
                    ent = maybe_annotate_name(
                        self.tokenizer, name, tkns, doc,
                        self.cdb, self.config, len(ner_ents))
                    if ent:
                        ner_ents.append(ent)
            elif name_reverse is not None:
                # Only the reversed word order matched.
                # NOTE(review): `name` itself is not updated here, so
                # later iterations keep extending the forward-order
                # name — confirm this is intended.
                if name_reverse in self.cdb.name2info:
                    ent = maybe_annotate_name(
                        self.tokenizer, name_reverse, tkns,
                        doc, self.cdb, self.config, len(ner_ents))
                    if ent:
                        ner_ents.append(ent)
            else:
                # Neither direction extends to a known subname — stop
                # growing this candidate.
                break
    return ner_ents