medcat.pipeline

Modules:

pipeline –

Classes:

Pipeline –

The pipeline for the NLP process.

Pipeline

Pipeline(cdb: CDB, vocab: Optional[Vocab], model_load_path: Optional[str], old_pipe: Optional[Pipeline] = None, addon_config_dict: Optional[dict[str, dict]] = None)

The pipeline for the NLP process.

This class is responsible for the initial creation of the NLP document, as well as for running it through all of the components and addons.

Methods:

add_addon –
entity_from_tokens –

Get the entity from the list of tokens.
get_component –

Get the core component by the component type.
get_doc –

Get the document for this text.
iter_addons –
iter_all_components –
save_components –

Attributes:

cdb –
config –
tokenizer (BaseTokenizer) –

The raw tokenizer (with no components).
tokenizer_with_tag (BaseTokenizer) –

The tokenizer with the tagging component.
vocab (Vocab) –

Source code in medcat-v2/medcat/pipeline/pipeline.py

def __init__(self, cdb: CDB, vocab: Optional[Vocab],
             model_load_path: Optional[str],
             # NOTE: upon reload, old pipe can be useful
             old_pipe: Optional['Pipeline'] = None,
             addon_config_dict: Optional[dict[str, dict]] = None):
    """Set up the tokenizer, the core components and the addons.

    Args:
        cdb (CDB): The concept database. Its `config` is also used as
            the config of the pipeline.
        vocab (Optional[Vocab]): The vocabulary. May be None for DeID
            models, but is expected to be set otherwise.
        model_load_path (Optional[str]): Passed on to both the tokenizer
            and the component initialisation. Presumably the path the
            model is being loaded from, if any - TODO confirm.
        old_pipe (Optional[Pipeline]): A previous pipeline, passed on to
            the component initialisation (can be useful upon reload).
            Defaults to None.
        addon_config_dict (Optional[dict[str, dict]]): Passed on to the
            component initialisation. Presumably per-addon configs -
            TODO confirm. Defaults to None.
    """
    self.cdb = cdb
    # NOTE: Vocab is None in case of DeID models and that's fine then,
    #       but it should be non-None otherwise
    self.vocab: Vocab = vocab  # type: ignore
    # the pipeline shares the config object of the CDB
    self.config = self.cdb.config
    # the tokenizer is created before any of the components
    self._tokenizer = self._init_tokenizer(model_load_path)
    # both lists start out empty; `_init_components` is called right after
    self._components: list[CoreComponent] = []
    self._addons: list[AddonComponent] = []
    self._init_components(model_load_path, old_pipe, addon_config_dict)

cdb `instance-attribute`

cdb = cdb

config `instance-attribute`

config = config

tokenizer `property`

tokenizer: BaseTokenizer

The raw tokenizer (with no components).

tokenizer_with_tag `property`

tokenizer_with_tag: BaseTokenizer

The tokenizer with the tagging component.

vocab `instance-attribute`

vocab: Vocab = vocab

add_addon

add_addon(addon: AddonComponent) -> None

Source code in medcat-v2/medcat/pipeline/pipeline.py

def add_addon(self, addon: AddonComponent) -> None:
    """Add an addon to the end of this pipeline's list of addons.

    Args:
        addon (AddonComponent): The addon to add. Its config is marked
            clean as part of adding it.
    """
    self._addons.append(addon)
    # mark clean as of adding
    # NOTE(review): presumably this resets change tracking on the addon
    #               config so only later changes count - confirm against
    #               the config's `mark_clean`
    addon.config.mark_clean()

entity_from_tokens

entity_from_tokens(tokens: list[MutableToken]) -> MutableEntity

Get the entity from the list of tokens.

This effectively turns a list of (consecutive) tokens into an entity.

Parameters:

tokens
(list[MutableToken]) –

The tokens to use.

Returns:

MutableEntity ( MutableEntity ) –

The resulting entity.

Source code in medcat-v2/medcat/pipeline/pipeline.py

def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Get the entity from the list of tokens.

    This effectively turns a list of (consecutive) tokens
    into an entity. The work is delegated to the pipeline's tokenizer.

    Args:
        tokens (list[MutableToken]): The tokens to use.

    Returns:
        MutableEntity: The resulting entity.
    """
    return self._tokenizer.entity_from_tokens(tokens)

get_component

get_component(ctype: CoreComponentType) -> CoreComponent

Get the core component by the component type.

Parameters:

ctype
(CoreComponentType) –

The core component type.

Raises:

ValueError –

If no component by that type is found.

Returns:

CoreComponent ( CoreComponent ) –

The corresponding core component.

Source code in medcat-v2/medcat/pipeline/pipeline.py

def get_component(self, ctype: CoreComponentType) -> CoreComponent:
    """Look up the core component registered for a component type.

    The components are scanned in order and the first core component
    whose type is `ctype` is returned.

    Args:
        ctype (CoreComponentType): The type of core component wanted.

    Raises:
        ValueError: If none of the components has that type.

    Returns:
        CoreComponent: The first matching core component.
    """
    for candidate in self._components:
        # the type is only queried for genuine core components
        is_core_component = (
            candidate.is_core() and isinstance(candidate, CoreComponent))
        if is_core_component and candidate.get_type() is ctype:
            return candidate
    raise ValueError(f"No component found of type {ctype}")

get_doc

get_doc(text: str) -> MutableDocument

Get the document for this text.

This runs the tokenizer over the text and then passes the resulting document through each core component and each addon in turn.

Parameters:

text
(str) –

The input text.

Returns:

MutableDocument ( MutableDocument ) –

The resulting document.

Source code in medcat-v2/medcat/pipeline/pipeline.py

def get_doc(self, text: str) -> MutableDocument:
    """Build the fully processed document for a piece of text.

    The text is tokenized first. The resulting document is then handed
    to every core component in order (logging each run), and after that
    to every addon in order. Each step receives the output of the
    previous one.

    Args:
        text (str): The input text.

    Returns:
        MutableDocument: The document returned by the last step.
    """
    document = self._tokenizer(text)
    for component in self._components:
        logger.info(
            "Running component %s for %d of text (%s)",
            component.full_name, len(text), id(text))
        document = component(document)
    # addons run after all the core components, without logging
    for addon_component in self._addons:
        document = addon_component(document)
    return document

iter_addons

iter_addons() -> Iterable[AddonComponent]

Source code in medcat-v2/medcat/pipeline/pipeline.py

def iter_addons(self) -> Iterable[AddonComponent]:
    """Lazily iterate over the addons in the order they were added.

    Yields:
        AddonComponent: Each addon of the pipeline.
    """
    for registered_addon in self._addons:
        yield registered_addon

iter_all_components

iter_all_components() -> Iterable[BaseComponent]

Source code in medcat-v2/medcat/pipeline/pipeline.py

def iter_all_components(self) -> Iterable[BaseComponent]:
    """Lazily iterate over every component of the pipeline.

    Yields:
        BaseComponent: All the core components first, followed by all
            the addons.
    """
    yield from self._components
    yield from self._addons

save_components

save_components(serialiser_type: Union[AvailableSerialisers, str], components_folder: str) -> None

Source code in medcat-v2/medcat/pipeline/pipeline.py

def save_components(self,
                    serialiser_type: Union[AvailableSerialisers, str],
                    components_folder: str) -> None:
    """Serialise each serialisable component into its own subfolder.

    Components that are not `Serialisable` are skipped. The components
    folder is only created once there is something to save, so it is
    left untouched if nothing in the pipeline is serialisable.

    Args:
        serialiser_type (Union[AvailableSerialisers, str]): The
            serialiser to use; passed on to `serialise` as is.
        components_folder (str): The folder that holds the subfolder
            of each component. Created (along with any missing
            parents) if it does not exist yet.

    Raises:
        ValueError: If a serialisable component is neither a
            CoreComponent nor an AddonComponent.
    """
    for component in self.iter_all_components():
        if not isinstance(component, Serialisable):
            continue
        # `exist_ok` avoids the check-then-create race of a separate
        # `exists` + `mkdir`, and `makedirs` copes with missing parents
        os.makedirs(components_folder, exist_ok=True)
        if isinstance(component, CoreComponent):
            # core components are identified by their type alone
            comp_folder = os.path.join(
                components_folder,
                AbstractCoreComponent.NAME_PREFIX +
                component.get_type().name)
        elif isinstance(component, AddonComponent):
            # addons are identified by both their addon type and name
            comp_folder = os.path.join(
                components_folder,
                f"{AddonComponent.NAME_PREFIX}{component.addon_type}"
                f"{AddonComponent.NAME_SPLITTER}{component.name}")
        else:
            raise ValueError(
                f"Unknown component: {type(component)} - does not appear "
                "to be a CoreComponent or an AddonComponent")
        serialise(serialiser_type, component, comp_folder)

medcat.pipeline

Pipeline

cdb `instance-attribute`

config `instance-attribute`

tokenizer `property`

tokenizer_with_tag `property`

vocab `instance-attribute`

add_addon

entity_from_tokens

`tokens`

get_component

`ctype`

get_doc

`text`

iter_addons

iter_all_components

save_components

medcat.pipeline

Pipeline

cdb instance-attribute

config instance-attribute

tokenizer property

tokenizer_with_tag property

vocab instance-attribute

add_addon

entity_from_tokens

tokens

get_component

ctype

get_doc

text

iter_addons

iter_all_components

save_components

cdb `instance-attribute`

config `instance-attribute`

tokenizer `property`

tokenizer_with_tag `property`

vocab `instance-attribute`

`tokens`

`ctype`

`text`