Skip to content

medcat.cdb.cdb

Classes:

Attributes:

logger module-attribute

logger = getLogger(__name__)

CDB

CDB(config: Config)

Bases: AbstractSerialisable

Methods:

Attributes:

Source code in medcat-v2/medcat/cdb/cdb.py
29
30
31
32
33
34
35
36
37
38
def __init__(self, config: Config) -> None:
    """Set up an empty concept database that uses the given config.

    All lookup tables start out empty and both change-tracking flags
    start out as False.

    Args:
        config (Config): The config; stored as-is on `self.config`.
    """
    self.config = config
    # CUI -> per-concept info
    self.cui2info: dict[str, CUIInfo] = {}
    # name -> per-name info
    self.name2info: dict[str, NameInfo] = {}
    # type ID -> type info (filled by `add_types`)
    self.type_id2info: dict[str, TypeInfo] = {}
    self.token_counts: dict[str, int] = {}
    # additional info of unspecified structure
    self.addl_info: dict[str, Any] = {}
    # set of subnames that `has_subname` checks membership against
    self._subnames: set[str] = set()
    # set to True by the methods that modify the CDB
    self.is_dirty = False
    # when True, `has_subname` rebuilds the subnames before answering
    self.has_changed_names = False

addl_info instance-attribute

addl_info: dict[str, Any] = {}

config instance-attribute

config = config

cui2info instance-attribute

cui2info: dict[str, CUIInfo] = {}

has_changed_names instance-attribute

has_changed_names = False

is_dirty instance-attribute

is_dirty = False

name2info instance-attribute

name2info: dict[str, NameInfo] = {}

token_counts instance-attribute

token_counts: dict[str, int] = {}

type_id2info instance-attribute

type_id2info: dict[str, TypeInfo] = {}

add_names

add_names(cui: str, names: dict[str, NameDescriptor], name_status: str = AUTOMATIC, full_build: bool = False) -> None

Adds a name to an existing concept.

Parameters:

  • cui

    (str) –

    Concept ID or unique identifier in this database, all concepts that have the same CUI will be merged internally.

  • names

    (dict[str, NameDescriptor]) –

    Names for this concept, or the value that if found in free text can be linked to this concept. Names is a dict like: {name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}. Names should be generated by the helper function 'medcat.preprocessing.cleaners.prepare_name'.

  • name_status

    (str, default: AUTOMATIC ) –

    One of P, N, A. Defaults to 'A'.

  • full_build

    (bool, default: False ) –

    If True the dictionary self.addl_info will also be populated, contains a lot of extra information about concepts, but can be very memory consuming. This is not necessary for normal functioning of MedCAT (Default value False).

Source code in medcat-v2/medcat/cdb/cdb.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def add_names(self, cui: str, names: dict[str, NameDescriptor],
              name_status: str = ST.AUTOMATIC, full_build: bool = False
              ) -> None:
    """Attach additional names to a concept.

    The status is upper-cased first; any value outside of
    `ST.ALLOWED_STATUS` is replaced with `ST.AUTOMATIC`. The actual work
    is delegated to `self._add_concept`, called with empty ontologies,
    empty type IDs and an empty description.

    Args:
        cui (str):
            Concept ID or unique identifier in this database, all concepts
            that have the same CUI will be merged internally.
        names (dict[str, NameDescriptor]):
            Names for this concept, or the value that if found in free
            text can be linked to this concept. Names is a dict like:
            `{name: {'tokens': tokens, 'snames': snames,
                     'raw_name': raw_name}, ...}`
            Names should be generated by the helper function
            'medcat.preprocessing.cleaners.prepare_name'
        name_status (str):
            One of `P`, `N`, `A`. Defaults to 'A'.
        full_build (bool):
            If True the dictionary self.addl_info will also be populated,
            contains a lot of extra information about concepts, but can be
            very memory consuming. This is not necessary for normal
            functioning of MedCAT (Default value `False`).
    """
    requested_status = name_status.upper()
    # anything that is not an allowed status falls back to automatic
    effective_status = (
        requested_status if requested_status in ST.ALLOWED_STATUS
        else ST.AUTOMATIC)

    self._add_concept(
        cui=cui,
        names=names,
        ontologies=set(),
        name_status=effective_status,
        type_ids=set(),
        description='',
        full_build=full_build,
    )

add_types

add_types(types: Iterable[tuple[str, str]]) -> None

Add type info to CDB.

Parameters:

Source code in medcat-v2/medcat/cdb/cdb.py
104
105
106
107
108
109
110
111
def add_types(self, types: Iterable[tuple[str, str]]) -> None:
    """Register type infos in `self.type_id2info`.

    An entry that already exists for a type ID is overwritten.

    Args:
        types (Iterable[tuple[str, str]]): Pairs of type ID and type name.
    """
    self.type_id2info.update(
        (tid, TypeInfo(tid, type_name)) for tid, type_name in types)

filter_by_cui

filter_by_cui(cuis_to_keep: Collection[str]) -> None

Subset the core CDB fields (dictionaries/maps).

Note that this will potentially keep a few more CUIs than those in cuis_to_keep. It will first find all names that link to the cuis_to_keep and then find all CUIs that link to those names and keep all of them.

This also will not remove any data from cdb.addl_info - as this field can contain data of unknown structure.

Parameters:

  • cuis_to_keep

    (Collection[str]) –

    CUIs that will be kept, the rest will be removed (not completely, look above).

Raises:

  • Exception

    If no snames and subsetting is not possible.

Source code in medcat-v2/medcat/cdb/cdb.py
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
def filter_by_cui(self, cuis_to_keep: Collection[str]) -> None:
    """Subset the core CDB fields (dictionaries/maps).

    Note that this will potentially keep a few more CUIs
    than those in cuis_to_keep. It will first find all names
    that link to the cuis_to_keep and then find all CUIs that
    link to those names and keep all of them.

    This also will not remove any data from cdb.addl_info -
    as this field can contain data of unknown structure.

    Requested CUIs that are not in the CDB are skipped with a warning.

    Args:
        cuis_to_keep (Collection[str]):
            CUIs that will be kept, the rest will be removed
            (not completely, look above).

    Raises:
        KeyError: If a name listed for a kept CUI has no entry
            in `self.name2info`.
    """
    # First get all names that should be kept based on the requested CUIs
    names_to_keep: set[str] = set()
    for cui in cuis_to_keep:
        if cui not in self.cui2info:
            logger.warning(
                "While filtering for CUIs asked to keep CUI '%s' "
                "which is not a part of the existing CDB", cui)
            continue
        names_to_keep.update(self.cui2info[cui]['names'])

    # Keep the info of every such name and, based on the names, collect
    # the indirect CUIs that have to be kept as well
    all_cuis_to_keep: set[str] = set()
    new_name2info: dict[str, NameInfo] = {}
    for name in names_to_keep:
        # NOTE: since this was based on the cui2info they
        #       should all have a name info
        ni = self.name2info[name]
        new_name2info[name] = ni
        all_cuis_to_keep.update(ni['per_cui_status'].keys())

    # A name may reference a CUI that has no info in this CDB; skip those
    new_cui2info: dict[str, CUIInfo] = {
        cui: self.cui2info[cui]
        for cui in all_cuis_to_keep
        if cui in self.cui2info
    }

    # set filtered dicts (only done once everything above has succeeded)
    self.cui2info = new_cui2info
    self.name2info = new_name2info
    # redo all subnames
    self._reset_subnames()
    self.is_dirty = True

get_basic_info

get_basic_info() -> CDBInfo
Source code in medcat-v2/medcat/cdb/cdb.py
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
def get_basic_info(self) -> CDBInfo:
    """Summarise the size and training state of the CDB.

    Returns:
        CDBInfo: A dict with the number of concepts and names, the
            number of concepts that have a non-zero training count, the
            total and per-concept average of those counts, and the
            unsupervised and supervised training histories from the
            config meta, each dumped in 'json' mode.
    """
    cui2ct = self.get_cui2count_train()
    cuis_trained = len(cui2ct)
    examples_seen = sum(cui2ct.values())
    # nothing trained means no examples were seen, so the average is 0
    # (this also avoids dividing by zero)
    average_count_train = (
        examples_seen / cuis_trained if cuis_trained else 0.0)
    meta = self.config.meta
    unsup_history = [
        history.model_dump(mode='json') for history in meta.unsup_trained
    ]
    sup_history = [
        history.model_dump(mode='json') for history in meta.sup_trained
    ]
    return {
        "Number of concepts": len(self.cui2info),
        "Number of names": len(self.name2info),
        "Number of concepts that received training": cuis_trained,
        "Number of seen training examples in total": examples_seen,
        "Average training examples per concept": average_count_train,
        "Unsupervised training history": unsup_history,
        "Supervised training history": sup_history,
    }

get_cui2count_train

get_cui2count_train() -> dict[str, int]
Source code in medcat-v2/medcat/cdb/cdb.py
451
452
453
454
455
def get_cui2count_train(self) -> dict[str, int]:
    """Map each CUI to its training count, leaving out zero counts.

    Returns:
        dict[str, int]: CUI -> `count_train` for every concept whose
            count is truthy.
    """
    trained: dict[str, int] = {}
    for cui, info in self.cui2info.items():
        count = info['count_train']
        if count:
            trained[cui] = count
    return trained

get_hash

get_hash() -> str
Source code in medcat-v2/medcat/cdb/cdb.py
463
464
465
466
467
468
469
470
471
472
def get_hash(self) -> str:
    """Compute a hash describing the current state of the CDB.

    The hash covers the number of CUIs, names and subnames (only the
    counts, not the contents) followed by the full per-CUI and per-name
    training counts.

    Returns:
        str: The hex digest produced by the hasher.
    """
    digest = Hasher()
    # only the sizes of these collections go in
    for collection in (self.cui2info, self.name2info, self._subnames):
        digest.update(len(collection))
    # whereas the training counts go in in their entirety
    for get_counts in (self.get_cui2count_train,
                       self.get_name2count_train):
        digest.update(get_counts())
    return digest.hexdigest()

get_init_attrs classmethod

get_init_attrs() -> list[str]
Source code in medcat-v2/medcat/cdb/cdb.py
40
41
42
@classmethod
def get_init_attrs(cls) -> list[str]:
    """List the init attribute names: only `'config'`.

    Returns:
        list[str]: A new list containing `'config'`.
    """
    init_attr_names = ['config']
    return init_attr_names

get_name

get_name(cui: str) -> str

Returns preferred name if it exists, otherwise it will return the longest name assigned to the concept.

Parameters:

  • cui

    (str) –

    Concept ID or unique identifier in this database.

Returns:

  • str ( str ) –

    The name of the concept.

Source code in medcat-v2/medcat/cdb/cdb.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def get_name(self, cui: str) -> str:
    """Get a human readable name for a concept.

    The preferred name wins if there is one. Otherwise the longest of
    the concept's names is used, with the configured separator replaced
    by spaces and the result title-cased. If the CUI is unknown, or the
    concept has no names at all, the CUI itself is returned.

    Args:
        cui (str):
            Concept ID or unique identifier in this database.

    Returns:
        str: The name of the concept.
    """
    separator = self.config.general.separator

    if cui in self.cui2info:
        info = self.cui2info[cui]
        preferred = info['preferred_name']
        all_names = info['names']
        if preferred:
            return preferred
        if all_names:
            longest = max(all_names, key=len)
            words = str(longest).split(separator)
            return " ".join(words).title()

    # nothing better found, so fall back to the CUI
    return cui

get_name2count_train

get_name2count_train() -> dict[str, int]
Source code in medcat-v2/medcat/cdb/cdb.py
457
458
459
460
461
def get_name2count_train(self) -> dict[str, int]:
    """Map each name to its training count, leaving out zero counts.

    Returns:
        dict[str, int]: name -> `count_train` for every name whose
            count is truthy.
    """
    trained: dict[str, int] = {}
    for name, info in self.name2info.items():
        count = info['count_train']
        if count:
            trained[name] = count
    return trained

has_subname

has_subname(name: str) -> bool

Whether the CDB has the specified subname.

Parameters:

  • name

    (str) –

    The subname to check.

Returns:

  • bool ( bool ) –

    Whether the subname is present in this CDB.

Source code in medcat-v2/medcat/cdb/cdb.py
51
52
53
54
55
56
57
58
59
60
61
62
63
def has_subname(self, name: str) -> bool:
    """Check whether a subname is known to this CDB.

    The subnames are rebuilt via `self._reset_subnames()` before the
    check if names have changed or if there are fewer subnames than
    names.

    Args:
        name (str): The subname to check.

    Returns:
        bool: Whether the subname is present in this CDB.
    """
    needs_rebuild = (
        self.has_changed_names
        or len(self._subnames) < len(self.name2info)
    )
    if needs_rebuild:
        self._reset_subnames()
    return name in self._subnames

load classmethod

load(path: str, perform_fixes: bool = True) -> CDB

Load the CDB off disk.

This can load a legacy (v1) CDB (.dat) or a v2 CDB either in its folder format or the .zip format. The distinction is made automatically.

Parameters:

  • path

    (str) –

    The path to the CDB.

  • perform_fixes

    (bool, default: True ) –

    Whether to perform fixes such as original names issue. Defaults to True.

Raises:

Returns:

  • CDB ( CDB ) –

    The loaded CDB.

Source code in medcat-v2/medcat/cdb/cdb.py
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
@classmethod
def load(cls, path: str, perform_fixes: bool = True) -> 'CDB':
    """Read a CDB from disk.

    Three on-disk formats are handled and told apart automatically:
    the v2 .zip format, a legacy (v1) .dat file, and the v2 folder
    format.

    Args:
        path (str): The path to the CDB.
        perform_fixes (bool): Whether to perform fixes such as
            original names issue. Defaults to True.

    Raises:
        LegacyConversionDisabledError:
            If a legacy model is found and conversion is not allowed.
        ValueError: If the loaded object isn't a CDB.

    Returns:
        CDB: The loaded CDB.
    """
    if should_serialise_as_zip(path, 'auto'):
        loaded = deserialise_from_zip(path)
    elif os.path.isfile(path) and path.endswith('.dat'):
        # legacy (v1) file: convert it unless that has been disabled
        if avoid_legacy_conversion():
            raise LegacyConversionDisabledError("CDB")
        from medcat.utils.legacy.convert_cdb import get_cdb_from_old
        doing_legacy_conversion_message(logger, 'CDB', path)
        loaded = get_cdb_from_old(path)
    else:
        loaded = deserialise(path)

    if not isinstance(loaded, CDB):
        raise ValueError(f"The path '{path}' is not a CDB!")

    if not perform_fixes:
        return loaded
    # perform fix(es)
    from medcat.utils.legacy.fixes import (
        fix_cui2original_names_if_needed)
    fix_cui2original_names_if_needed(loaded)
    return loaded

remove_cui

remove_cui(cui: str) -> None

This function takes a CUI and removes it from the CDB.

It also removes the CUI from name specific per_cui_status maps as well as removing all the names that do not correspond to any CUIs after the removal of this one.

Parameters:

  • cui

    (str) –

    The CUI to remove.

Source code in medcat-v2/medcat/cdb/cdb.py
377
378
379
380
381
382
383
384
385
386
387
388
389
390
def remove_cui(self, cui: str) -> None:
    """Remove a single CUI from the CDB.

    The CUI is also removed from the name specific per_cui_status
    maps, and any name that no longer corresponds to a CUI after
    the removal is removed as well. Afterwards the subnames are
    rebuilt and the CDB is marked as dirty.

    Args:
        cui (str): The CUI to remove.
    """
    self._remove_cui(cui)
    # the set of subnames may have changed, so rebuild it
    self._reset_subnames()
    self.is_dirty = True

remove_cuis_bulk

remove_cuis_bulk(cuis: Sequence[str]) -> None
Source code in medcat-v2/medcat/cdb/cdb.py
357
358
359
360
361
def remove_cuis_bulk(self, cuis: Sequence[str]) -> None:
    """Remove several CUIs from the CDB in one go.

    Each CUI is removed with `self._remove_cui`. The subnames are
    rebuilt only once, after all removals, and the CDB is then
    marked as dirty, just like `remove_cui` does for a single CUI.

    Args:
        cuis (Sequence[str]): The CUIs to remove.
    """
    for cui in cuis:
        self._remove_cui(cui)
    # reset subnames once
    self._reset_subnames()
    # the CDB was modified, so flag it in the same way remove_cui does
    self.is_dirty = True

reset_training

reset_training() -> None

Will remove all training efforts - in other words all embeddings that are learnt for concepts in the current CDB. Please note that this does not remove synonyms (names) that were potentially added during supervised/online learning.

Source code in medcat-v2/medcat/cdb/cdb.py
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
def reset_training(self) -> None:
    """Undo all training done on this CDB.

    In other words, everything that was learnt for the concepts in the
    current CDB is removed. Please note that synonyms (names) that were
    potentially added during supervised/online learning are NOT removed.

    The subnames and the training histories in the config meta are
    cleared as well, and the CDB is marked as dirty.
    """
    for concept_info in self.cui2info.values():
        reset_cui_training(concept_info)
    for per_name_info in self.name2info.values():
        per_name_info['count_train'] = 0
    self._subnames.clear()
    # clear config entries as well
    meta = self.config.meta
    for history in (meta.unsup_trained, meta.sup_trained):
        history.clear()
    self.is_dirty = True

save

save(save_path: str, serialiser: Union[str, AvailableSerialisers] = dill, overwrite: bool = False, as_zip: Union[bool, Literal['auto']] = 'auto') -> None

Save CDB at path.

Parameters:

  • save_path

    (str) –

    The path to save at.

  • serialiser

    (Union[str, AvailableSerialisers], default: dill ) –

    The serialiser. Defaults to AvailableSerialisers.dill.

  • overwrite

    (bool, default: False ) –

    Whether to allow overwriting existing files. Defaults to False.

  • as_zip

    (Union[bool, Literal['auto']], default: 'auto' ) –

    Whether to serialise the CDB as a zip.

Source code in medcat-v2/medcat/cdb/cdb.py
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
def save(self, save_path: str,
         serialiser: Union[
             str, AvailableSerialisers] = AvailableSerialisers.dill,
         overwrite: bool = False,
         as_zip: Union[bool, Literal['auto']] = 'auto',
         ) -> None:
    """Write the CDB to disk.

    Whether the .zip format is used is decided by
    `should_serialise_as_zip` based on the path and `as_zip`.

    Args:
        save_path (str):
            The path to save at.
        serialiser (Union[str, AvailableSerialisers], optional):
            The serialiser. Defaults to AvailableSerialisers.dill.
        overwrite (bool, optional):
            Whether to allow overwriting existing files. Defaults to False.
        as_zip (Union[bool, Literal['auto']]):
            Whether to serialise the CDB as a zip.
    """
    if not should_serialise_as_zip(save_path, as_zip):
        serialise(serialiser, self, save_path, overwrite=overwrite)
    else:
        serialise_as_zip(self, save_path, serialiser, overwrite=overwrite)

weighted_average_function

weighted_average_function(step: int) -> float

Get the weighted average for step.

Parameters:

  • step

    (int) –

    The step.

Returns:

  • float ( float ) –

    The weighted average.

Source code in medcat-v2/medcat/cdb/cdb.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
def weighted_average_function(self, step: int) -> float:
    """Get the weighted average for the given step.

    This simply delegates to `default_weighted_average`.

    Args:
        step (int): The step.

    Returns:
        float: The weighted average.
    """
    weight = default_weighted_average(step)
    return weight