Skip to content

medcat.pipeline

Modules:

Classes:

  • Pipeline

    The pipeline for the NLP process.

Pipeline

Pipeline(cdb: CDB, vocab: Optional[Vocab], model_load_path: Optional[str], old_pipe: Optional[Pipeline] = None, addon_config_dict: Optional[dict[str, dict]] = None)

The pipeline for the NLP process.

This class is responsible for the initial creation of the NLP document, as well as for running it through all of the components and addons.

Methods:

Attributes:

Source code in medcat-v2/medcat/pipeline/pipeline.py
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def __init__(self, cdb: CDB, vocab: Optional[Vocab],
             model_load_path: Optional[str],
             # NOTE: upon reload, old pipe can be useful
             old_pipe: Optional['Pipeline'] = None,
             addon_config_dict: Optional[dict[str, dict]] = None):
    """Set up the pipeline: tokenizer first, then components and addons.

    Args:
        cdb (CDB): The concept database. Its `config` attribute is also
            stored as the pipeline's `config`.
        vocab (Optional[Vocab]): The vocabulary. May be None for DeID
            models; expected to be non-None otherwise.
        model_load_path (Optional[str]): Passed on to both the tokenizer
            initialisation and the component initialisation.
            NOTE(review): presumably the folder of a saved model pack,
            or None when nothing is loaded from disk - confirm.
        old_pipe (Optional[Pipeline]): A previous pipeline, passed on to
            the component initialisation. Defaults to None.
        addon_config_dict (Optional[dict[str, dict]]): Passed on to the
            component initialisation. Defaults to None.
            NOTE(review): the expected keys are not visible here.
    """
    self.cdb = cdb
    # NOTE: Vocab is None in case of DeID models and thats fine then,
    #       but it should be non-None otherwise
    self.vocab: Vocab = vocab  # type: ignore
    # the pipeline uses the very same config object as the CDB
    self.config = self.cdb.config
    self._tokenizer = self._init_tokenizer(model_load_path)
    # both lists start out empty before _init_components is called
    self._components: list[CoreComponent] = []
    self._addons: list[AddonComponent] = []
    self._init_components(model_load_path, old_pipe, addon_config_dict)

cdb instance-attribute

cdb = cdb

config instance-attribute

config = config

tokenizer property

tokenizer: BaseTokenizer

The raw tokenizer (with no components).

tokenizer_with_tag property

tokenizer_with_tag: BaseTokenizer

The tokenizer with the tagging component.

vocab instance-attribute

vocab: Vocab = vocab

add_addon

add_addon(addon: AddonComponent) -> None
Source code in medcat-v2/medcat/pipeline/pipeline.py
364
365
366
367
def add_addon(self, addon: AddonComponent) -> None:
    """Append an addon to the pipeline's list of addons.

    The addon's config is marked clean right after the addon is added.

    Args:
        addon (AddonComponent): The addon to add.
    """
    addons = self._addons
    addons.append(addon)
    # the config counts as clean from the moment the addon is added
    addon_config = addon.config
    addon_config.mark_clean()

entity_from_tokens

entity_from_tokens(tokens: list[MutableToken]) -> MutableEntity

Get the entity from the list of tokens.

This effectively turns a list of (consecutive) tokens into an entity.

Parameters:

Returns:

Source code in medcat-v2/medcat/pipeline/pipeline.py
331
332
333
334
335
336
337
338
339
340
341
342
343
def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Build an entity out of a list of tokens.

    The work is delegated to the pipeline's tokenizer.

    Args:
        tokens (list[MutableToken]): The (consecutive) tokens that
            make up the entity.

    Returns:
        MutableEntity: The entity created by the tokenizer.
    """
    tokenizer = self._tokenizer
    entity = tokenizer.entity_from_tokens(tokens)
    return entity

get_component

Get the core component by the component type.

Parameters:

Raises:

  • ValueError

    If no component by that type is found.

Returns:

  • CoreComponent ( CoreComponent ) –

    The corresponding core component.

Source code in medcat-v2/medcat/pipeline/pipeline.py
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
def get_component(self, ctype: CoreComponentType) -> CoreComponent:
    """Look up the pipeline's core component of the given type.

    Args:
        ctype (CoreComponentType): The type of core component wanted.

    Raises:
        ValueError: If none of the components is of the given type.

    Returns:
        CoreComponent: The first component whose type is `ctype`.
    """
    for candidate in self._components:
        # only genuine core components are eligible; the type is
        # compared by identity
        if (candidate.is_core() and isinstance(candidate, CoreComponent)
                and candidate.get_type() is ctype):
            return candidate
    raise ValueError(f"No component found of type {ctype}")

get_doc

get_doc(text: str) -> MutableDocument

Get the document for this text.

This runs the tokenizer over the text and then passes the resulting document through every core component and addon in turn.

Parameters:

  • text

    (str) –

    The input text.

Returns:

Source code in medcat-v2/medcat/pipeline/pipeline.py
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
def get_doc(self, text: str) -> MutableDocument:
    """Build the processed document for a piece of text.

    The text is tokenized first. The resulting document is then handed
    to each core component in turn (logging every step), followed by
    each addon.

    Args:
        text (str): The input text.

    Returns:
        MutableDocument: The document after the tokenizer, all the
            components and all the addons have run.
    """
    document = self._tokenizer(text)
    for component in self._components:
        logger.info("Running component %s for %d of text (%s)",
                    component.full_name, len(text), id(text))
        # each step works on the output of the previous one
        document = component(document)
    for extra in self._addons:
        document = extra(document)
    return document

iter_addons

iter_addons() -> Iterable[AddonComponent]
Source code in medcat-v2/medcat/pipeline/pipeline.py
399
400
def iter_addons(self) -> Iterable[AddonComponent]:
    """Lazily iterate over the pipeline's addons, in list order.

    Yields:
        AddonComponent: Each addon in turn.
    """
    for addon in self._addons:
        yield addon

iter_all_components

iter_all_components() -> Iterable[BaseComponent]
Source code in medcat-v2/medcat/pipeline/pipeline.py
393
394
395
396
397
def iter_all_components(self) -> Iterable[BaseComponent]:
    """Lazily iterate over every component of the pipeline.

    The core components come first, followed by the addons.

    Yields:
        BaseComponent: Each core component, then each addon.
    """
    yield from self._components
    yield from self._addons

save_components

save_components(serialiser_type: Union[AvailableSerialisers, str], components_folder: str) -> None
Source code in medcat-v2/medcat/pipeline/pipeline.py
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
def save_components(self,
                    serialiser_type: Union[AvailableSerialisers, str],
                    components_folder: str) -> None:
    """Serialise every serialisable component into its own sub-folder.

    Components that are not `Serialisable` are skipped. For the rest,
    the sub-folder name is built as follows:
      - core components: `AbstractCoreComponent.NAME_PREFIX` followed
        by the name of the component type;
      - addons: `AddonComponent.NAME_PREFIX`, the addon type,
        `AddonComponent.NAME_SPLITTER` and the addon name.

    The components folder (including any missing parent folders) is
    only created once there is something to save into it.

    Args:
        serialiser_type (Union[AvailableSerialisers, str]): The
            serialiser to use; handed to `serialise` unchanged.
        components_folder (str): The folder that holds the
            per-component sub-folders.

    Raises:
        ValueError: If a serialisable component is neither a
            CoreComponent nor an AddonComponent.
    """
    for component in self.iter_all_components():
        if not isinstance(component, Serialisable):
            continue
        # EAFP instead of an exists()/mkdir() pair: no race between the
        # check and the creation, and missing parents are created too
        os.makedirs(components_folder, exist_ok=True)
        if isinstance(component, CoreComponent):
            comp_folder = os.path.join(
                components_folder,
                AbstractCoreComponent.NAME_PREFIX +
                component.get_type().name)
        elif isinstance(component, AddonComponent):
            comp_folder = os.path.join(
                components_folder,
                f"{AddonComponent.NAME_PREFIX}{component.addon_type}"
                f"{AddonComponent.NAME_SPLITTER}{component.name}")
        else:
            raise ValueError(
                f"Unknown component: {type(component)} - does not appear "
                "to be a CoreComponent or an AddonComponent")
        serialise(serialiser_type, component, comp_folder)