Skip to content

medcat.tokenizing.regex_impl.tokenizer

Classes:

Document

Document(text: str, tokens: Optional[list[MutableToken]] = None)

Methods:

Attributes:

Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
222
223
224
225
226
227
def __init__(self, text: str, tokens: Optional[list[MutableToken]] = None
             ) -> None:
    self.text = text
    self._tokens = tokens or []
    self.ner_ents: list[MutableEntity] = []
    self.linked_ents: list[MutableEntity] = []

base property

linked_ents instance-attribute

linked_ents: list[MutableEntity] = []

ner_ents instance-attribute

ner_ents: list[MutableEntity] = []

text instance-attribute

text = text

get_addon_data

get_addon_data(path: str) -> Any
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
279
280
281
282
def get_addon_data(self, path: str) -> Any:
    """Fetch the value stored under a registered addon path.

    Raises:
        UnregisteredDataPathException: If `path` was never registered
            on this class via `register_addon_path`.
    """
    if hasattr(self.__class__, path):
        return getattr(self, path)
    raise UnregisteredDataPathException(self.__class__, path)

get_available_addon_paths

get_available_addon_paths() -> list[str]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
284
285
286
def get_available_addon_paths(self) -> list[str]:
    """List the registered addon paths that currently hold truthy data."""
    available = []
    for candidate in self._addon_extension_paths:
        if self.get_addon_data(candidate):
            available.append(candidate)
    return available

get_tokens

get_tokens(start_index: int, end_index: int) -> list[MutableToken]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
256
257
258
259
260
261
262
263
def get_tokens(self, start_index: int, end_index: int
               ) -> list[MutableToken]:
    """Collect tokens whose character index falls inside the span.

    Both `start_index` and `end_index` are inclusive bounds on
    `token.base.char_index`.
    """
    return [token for token in self
            if start_index <= token.base.char_index <= end_index]

has_addon_data

has_addon_data(path: str) -> bool
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
276
277
def has_addon_data(self, path: str) -> bool:
    """Whether the registered addon path currently holds truthy data."""
    data = self.get_addon_data(path)
    return True if data else False

isupper

isupper() -> bool
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
268
269
def isupper(self) -> bool:
    """Mirror `str.isupper` for the document's full text."""
    full_text = self.text
    return full_text.isupper()

register_addon_path classmethod

register_addon_path(path: str, def_val: Any = None, force: bool = True) -> None
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
288
289
290
291
292
@classmethod
def register_addon_path(cls, path: str, def_val: Any = None,
                        force: bool = True) -> None:
    """Register `path` as an addon data slot on this class.

    The default value is stored as a class attribute, so instances
    fall back to it until `set_addon_data` shadows it per-instance.

    NOTE(review): `force` is accepted but never consulted —
    registration always overwrites an existing slot; confirm the
    intended semantics.
    """
    cls._addon_extension_paths.add(path)
    setattr(cls, path, def_val)

set_addon_data

set_addon_data(path: str, val: Any) -> None
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
271
272
273
274
def set_addon_data(self, path: str, val: Any) -> None:
    """Store `val` on this instance under a registered addon path.

    Raises:
        UnregisteredDataPathException: If `path` was never registered
            on this class via `register_addon_path`.
    """
    if hasattr(self.__class__, path):
        setattr(self, path, val)
        return
    raise UnregisteredDataPathException(self.__class__, path)

Entity

Entity(document: Document, text: str, start_index: int, end_index: int, start_char_index: int, end_char_index: int)

Methods:

Attributes:

Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def __init__(self, document: 'Document',
             text: str, start_index: int, end_index: int,
             start_char_index: int, end_char_index: int) -> None:
    self._doc = document
    self._text = text
    self._start_index = start_index
    self._end_index = end_index
    self._start_char_index = start_char_index
    self._end_char_index = end_char_index
    # defaults
    self.link_candidates: list[str] = []
    self.context_similarity: float = 0.0
    self.confidence: float = 0.0
    self.cui = ''
    self.id = -1  # TODO - what's the default?
    self.detected_name = ''

ENTITY_INFO_PREFIX class-attribute instance-attribute

ENTITY_INFO_PREFIX = 'Entity:'

base property

base: BaseEntity

confidence instance-attribute

confidence: float = 0.0

context_similarity instance-attribute

context_similarity: float = 0.0

cui instance-attribute

cui = ''

detected_name instance-attribute

detected_name = ''

end_char_index property

end_char_index: int

end_index property

end_index: int

id instance-attribute

id = -1

label property

label: int

link_candidates instance-attribute

link_candidates: list[str] = []

start_char_index property

start_char_index: int

start_index property

start_index: int

text property

text: str

get_addon_data

get_addon_data(path: str) -> Any
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
179
180
181
182
def get_addon_data(self, path: str) -> Any:
    """Read this entity's value for `path` from the owning document.

    Entity addon data is stored on the document under a prefixed path,
    keyed by the entity's (start_index, end_index) pair.
    """
    # doc.get_addon_data raises if the path was never registered
    span_key = (self.start_index, self.end_index)
    per_entity = self._doc.get_addon_data(self.ENTITY_INFO_PREFIX + path)
    return per_entity[span_key]

get_available_addon_paths

get_available_addon_paths() -> list[str]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
184
185
186
def get_available_addon_paths(self) -> list[str]:
    """List registered entity addon paths that hold truthy data."""
    found = []
    for registered in self._addon_extension_paths:
        if self.get_addon_data(registered):
            found.append(registered)
    return found

has_addon_data

has_addon_data(path: str) -> bool
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
176
177
def has_addon_data(self, path: str) -> bool:
    """Whether this entity has truthy data stored under `path`."""
    value = self.get_addon_data(path)
    return True if value else False

register_addon_path classmethod

register_addon_path(path: str, def_val: Any = None, force: bool = True) -> None
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
188
189
190
191
192
193
194
195
196
197
198
@classmethod
def register_addon_path(cls, path: str, def_val: Any = None,
                        force: bool = True) -> None:
    """Register an entity addon path, backed by storage on Document.

    Entities may be created and recreated, whereas the document is
    stable, so the actual data lives on the document in a defaultdict
    that maps each entity's (start, end) index pair to its value.
    """
    per_entity_store: dict = defaultdict(lambda: def_val)
    doc_path = cls.ENTITY_INFO_PREFIX + path
    Document.register_addon_path(doc_path, def_val=per_entity_store,
                                 force=force)
    cls._addon_extension_paths.add(path)

set_addon_data

set_addon_data(path: str, val: Any) -> None
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
171
172
173
174
def set_addon_data(self, path: str, val: Any) -> None:
    """Write this entity's value for `path` into the owning document."""
    # doc.get_addon_data raises if the path was never registered
    span_key = (self.start_index, self.end_index)
    store = self._doc.get_addon_data(self.ENTITY_INFO_PREFIX + path)
    store[span_key] = val

RegexTokenizer

Bases: BaseTokenizer

Methods:

Attributes:

REGEX class-attribute instance-attribute

REGEX = compile('(([^a-zA-Z0-9\\s]+|\\b\\w+\\b|\\S+)\\s?)')

create_entity

create_entity(doc: MutableDocument, token_start_index: int, token_end_index: int, label: str) -> MutableEntity
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
331
332
333
334
335
336
337
def create_entity(self, doc: MutableDocument,
                  token_start_index: int, token_end_index: int,
                  label: str) -> MutableEntity:
    """Build an entity from the document's tokens in the index range.

    NOTE(review): the slice end is exclusive here (an inclusive
    `token_end_index + 1` variant was left commented out in the
    original) and `label` is not used — confirm both are intended.
    """
    regex_doc = cast(Document, doc)
    selected = regex_doc._tokens[token_start_index:token_end_index]
    return self.entity_from_tokens(selected)

create_new_tokenizer classmethod

create_new_tokenizer(config: Config) -> RegexTokenizer
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
366
367
368
@classmethod
def create_new_tokenizer(cls, config: Config) -> 'RegexTokenizer':
    """Create a fresh tokenizer; `config` is accepted but unused."""
    instance = cls()
    return instance

entity_from_tokens

entity_from_tokens(tokens: list[MutableToken]) -> MutableEntity
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
342
343
344
345
346
347
348
def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Create an entity spanning the given (non-empty) run of tokens.

    Raises:
        ValueError: If `tokens` is empty.
    """
    if not tokens:
        raise ValueError("Need at least one token for an entity")
    first, last = tokens[0], tokens[-1]
    doc = cast(Token, first)._doc
    start = doc._tokens.index(first)
    end = doc._tokens.index(last)
    return _entity_from_tokens(doc, tokens, start, end)

get_doc_class

get_doc_class() -> Type[MutableDocument]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
370
371
def get_doc_class(self) -> Type[MutableDocument]:
    """The mutable document implementation used by this tokenizer."""
    return Document

get_entity_class

get_entity_class() -> Type[MutableEntity]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
373
374
def get_entity_class(self) -> Type[MutableEntity]:
    """The mutable entity implementation used by this tokenizer."""
    return Entity

Token

Token(document: Document, text: str, _text_with_ws: str, start_index: int, token_index: int, is_punct: bool, to_skip: bool)

Attributes:

Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def __init__(self, document: 'Document',
             text: str, _text_with_ws: str,
             start_index: int, token_index: int,
             is_punct: bool, to_skip: bool) -> None:
    self._doc = document
    self._text = text
    self._text_with_ws = _text_with_ws
    self._start_index = start_index
    self._token_index = token_index
    self._is_punct = is_punct
    self._to_skip = to_skip
    # defaults
    if self.norm is None:
        # force spacy to init ''
        self.norm = ''

base property

base: BaseToken

char_index property

char_index: int

index property

index: int

is_digit property

is_digit: bool

is_punctuation property writable

is_punctuation: bool

is_stop property

is_stop: bool

is_upper property

is_upper: bool

lemma property

lemma: str

lower property

lower: str

norm property writable

norm: str

tag property

tag: Optional[str]

text property

text: str

text_versions property

text_versions: list[str]

text_with_ws property

text_with_ws: str

to_skip property writable

to_skip: bool