Skip to content

medcat.utils.regression.checking

Classes:

Functions:

Attributes:

UNKNOWN_METADATA module-attribute

UNKNOWN_METADATA = 'Unknown'

logger module-attribute

logger = getLogger(__name__)

MalformedRegressionCaseException

MalformedRegressionCaseException(*args: object)

Bases: ValueError

Source code in medcat-v2/medcat/utils/regression/checking.py
534
535
def __init__(self, *args: object) -> None:
    """Create the exception, forwarding all arguments to the parent class.

    Args:
        *args (object): The exception arguments, passed on unchanged.
    """
    super().__init__(*args)

MetaData

Bases: BaseModel

The metadata for the regression suite.

This should define which ontology (e.g. UMLS or SNOMED) as well as which version of it was used when generating the regression suite.

The metadata may contain further information as well, this may include the annotator(s) involved when converting from MCT export or other relevant data.

Methods:

Attributes:

extra class-attribute instance-attribute

extra: dict = {}

ontology instance-attribute

ontology: str

ontology_version instance-attribute

ontology_version: str

regr_suite_creation_date class-attribute instance-attribute

regr_suite_creation_date: str = Field(default_factory=lambda: str(now()))

from_modelcard classmethod

from_modelcard(model_card: dict) -> MetaData

Generate a MetaData object from a model card.

This involves reading ontology info and version from the model card.

It must be noted that the model card should be provided as a dict not a string.

Parameters:

  • model_card

    (dict) –

    The CAT modelcard

Returns:

  • MetaData ( MetaData ) –

    The resulting MetaData

Source code in medcat-v2/medcat/utils/regression/checking.py
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
@classmethod
def from_modelcard(cls, model_card: dict) -> 'MetaData':
    """Build the regression suite metadata based on a model card.

    The ontology and its version are looked up from the model card
    (using `get_ontology_and_version`) and the model card itself is
    kept as the `extra` information.

    NOTE: The model card needs to be passed as a dict, not as a string.

    Args:
        model_card (dict): The CAT modelcard

    Returns:
        MetaData: The resulting MetaData
    """
    ont_info = get_ontology_and_version(model_card)
    ont_name, ont_ver = ont_info
    return MetaData(ontology=ont_name,
                    ontology_version=ont_ver,
                    extra=model_card)

unknown classmethod

unknown() -> MetaData
Source code in medcat-v2/medcat/utils/regression/checking.py
318
319
320
321
322
@classmethod
def unknown(cls) -> 'MetaData':
    """Create placeholder metadata for when no real metadata is available.

    The ontology, the ontology version and the creation date are all
    set to the `UNKNOWN_METADATA` marker and no extra info is attached.

    Returns:
        MetaData: The placeholder metadata
    """
    return MetaData(
        ontology=UNKNOWN_METADATA, ontology_version=UNKNOWN_METADATA,
        extra={}, regr_suite_creation_date=UNKNOWN_METADATA)

RegressionCase

Bases: BaseModel

A regression case that has a name, defines options, filters and phrases.

Methods:

Attributes:

name instance-attribute

name: str

options instance-attribute

options: OptionSet

phrases instance-attribute

phrases: list[str]

report instance-attribute

check_specific_for_phrase

Checks whether the specific target along with the specified phrase is able to be identified using the specified model.

Parameters:

Raises:

Returns:

  • tuple[Finding, Optional[str]]

    tuple[Finding, Optional[str]]: The nature to which the target was (or wasn't) identified

Source code in medcat-v2/medcat/utils/regression/checking.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def check_specific_for_phrase(self, cat: CAT, target: FinalTarget,
                              translation: TranslationLayer
                              ) -> tuple[Finding, Optional[str]]:
    """Run the model on one final target and report what was found.

    The placeholder in the target's phrase is substituted with the
    target's name, the model is run on the resulting text and the
    returned entities are used to determine the finding for the expected
    concept at the position of the placeholder. The finding is then
    added to the report of this regression case.

    Args:
        cat (CAT): The model
        target (FinalTarget): The final target configuration
        translation (TranslationLayer): The translation layer

    Raises:
        MalformedRegressionCaseException:
            If the phrase does not have exactly one placeholder.

    Returns:
        tuple[Finding, Optional[str]]:
            The nature to which the target was (or wasn't) identified.
            This is the value produced by `Finding.determine`.
    """
    text = target.final_phrase
    exp_cui = target.cui
    exp_name = target.name
    ph = target.placeholder
    ph_count = text.count(ph)
    if ph_count != 1:
        raise MalformedRegressionCaseException(
            f"Got {ph_count} placeholders "
            f"({ph}) (expected 1) for phrase: " +
            text)
    # the name takes the place of the placeholder, so the expected
    # entity starts where the placeholder started
    start = text.find(ph)
    model_input = text.replace(ph, exp_name)
    entities = cat.get_entities(model_input, only_cui=False)['entities']
    end = start + len(exp_name)
    finding = Finding.determine(exp_cui, start, end, translation, entities)
    case_id = (exp_cui, exp_name)
    # NOTE(review): the return annotation suggests `finding` may be a
    #               tuple, in which case this identity check would never
    #               succeed - confirm against Finding.determine
    if finding is not Finding.IDENTICAL:
        cui_col = [entities[key]['cui'] for key in entities]
        name_col = [entities[key]['source_value'] for key in entities]
        found = ', '.join(
            f'{found_cui}|{found_name}'
            for found_cui, found_name in zip(cui_col, name_col))
        logger.debug(
            'FAILED to (fully) match (%s) test case %s in phrase "%s", '
            'found the following CUIS/names: %s',
            finding, case_id, text, found)
    else:
        logger.debug(
            'Matched test case %s in phrase "%s"', case_id, text)
    self.report.report(target, finding)
    return finding

estimate_num_of_diff_subcases

estimate_num_of_diff_subcases() -> int
Source code in medcat-v2/medcat/utils/regression/checking.py
82
83
def estimate_num_of_diff_subcases(self) -> int:
    """Estimate the number of different sub-cases of this regression case.

    Returns:
        int: The number of phrases multiplied by the number of sub-cases
            estimated by the options.
    """
    phrase_count = len(self.phrases)
    per_phrase = self.options.estimate_num_of_subcases()
    return phrase_count * per_phrase

from_dict classmethod

from_dict(name: str, in_dict: dict) -> RegressionCase

Construct the regression case from a dict.

The expected structure: { 'targeting': { [ # the placeholder to be replaced 'placeholder': '[DIAGNOSIS]' 'cuis': ['cui1', 'cui2'] 'prefname-only': 'false', # optional ] }, 'phrases': ['phrase %s'] # possible multiple }

Parameters:

  • name

    (str) –

    The name of the case

  • in_dict

    (dict) –

    The dict describing the case

Raises:

  • ValueError

    If the input dict does not have the 'targeting' section

  • ValueError

    If there are no phrases defined

Returns:

  • RegressionCase ( RegressionCase ) –

    The constructed regression cases.

Source code in medcat-v2/medcat/utils/regression/checking.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
@classmethod
def from_dict(cls, name: str, in_dict: dict) -> 'RegressionCase':
    """Build a regression case from its dict description.

    The expected structure:
    {
        'targeting': {
            [
                # the placeholder to be replaced
                'placeholder': '[DIAGNOSIS]'
                'cuis': ['cui1', 'cui2']
                'prefname-only': 'false', # optional
            ]
        },
        'phrases': ['phrase %s'] # possible multiple
    }

    The 'targeting' section is handed to `OptionSet.from_dict` as is.
    The 'phrases' may either be a list of phrases or a single phrase.

    Args:
        name (str): The name of the case
        in_dict (dict): The dict describing the case

    Raises:
        ValueError: If the input dict does not have the 'targeting' section
        ValueError: If there are no phrases defined

    Returns:
        RegressionCase: The constructed regression case.
    """
    # the targeting section describes the options
    if 'targeting' not in in_dict:
        raise ValueError('Input dict should define targeting')
    option_set = OptionSet.from_dict(in_dict['targeting'])
    # the phrases the targets are to be placed in
    if 'phrases' not in in_dict:
        raise ValueError('Input dict should defined phrases')
    raw_phrases = in_dict['phrases']
    if isinstance(raw_phrases, list):
        phrase_list = raw_phrases
    else:
        # a lone phrase does not need to be wrapped in a list
        phrase_list = [raw_phrases]
    if not phrase_list:
        raise ValueError('Need at least one target phrase')
    return RegressionCase(
        name=name,
        options=option_set,
        phrases=phrase_list,
        report=ResultDescriptor(name=name),
    )

get_distinct_cases

Gets the various distinct sub-case iterators.

The sub-cases are those that can be determined without the translation layer. However, the translation layer is included here since it streamlines the operation.

Parameters:

  • translation

    (TranslationLayer) –

    The translation layer.

  • edit_distance

    (tuple[int, int, int]) –

    The edit distance(s) to try.

  • use_diacritics

    (bool) –

    Whether to use diacritics for edit distance.

Yields:

  • Iterator[FinalTarget]

    Iterator[Iterator[FinalTarget]]: The iterator of iterators of different sub cases.

Source code in medcat-v2/medcat/utils/regression/checking.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def get_distinct_cases(self, translation: TranslationLayer,
                       edit_distance: tuple[int, int, int],
                       use_diacritics: bool
                       ) -> Iterator[Iterator[FinalTarget]]:
    """Generate the iterators of the distinct sub-cases of this case.

    One sub-case iterator is produced for every combination of the
    preprocessors/targets provided by the options and the phrases of
    this case.

    The sub-cases could be determined without the translation layer.
    However, the translation layer is included here since it
    streamlines the operation.

    Args:
        translation (TranslationLayer): The translation layer.
        edit_distance (tuple[int, int, int]): The edit distance(s) to try.
        use_diacritics (bool): Whether to use diacritics for edit distance.

    Yields:
        Iterator[Iterator[FinalTarget]]:
            The iterator of iterators of different sub cases.
    """
    changers = self.options.get_preprocessors_and_targets(translation)
    # the phrases are looped over for each of the changers
    for cur_changer in changers:
        for cur_phrase in self.phrases:
            subcases = self._get_subcases(
                cur_phrase, cur_changer, translation,
                edit_distance, use_diacritics)
            yield subcases

to_dict

to_dict() -> dict

Converts the RegressionCase to a dict for serialisation.

Returns:

  • dict ( dict ) –

    The dict representation

Source code in medcat-v2/medcat/utils/regression/checking.py
162
163
164
165
166
167
168
169
170
171
def to_dict(self) -> dict:
    """Serialise the regression case into a dict.

    The dict has a copy of the phrases (as a list) under 'phrases' and
    the dict representation of the options under 'targeting'.

    Returns:
        dict: The dict representation
    """
    return {
        'phrases': list(self.phrases),
        'targeting': self.options.to_dict(),
    }

RegressionSuite

RegressionSuite(cases: list[RegressionCase], metadata: MetaData, name: str)

The regression checker. This is used to check a bunch of regression cases at once against a model.

Parameters:

  • cases

    (list[RegressionCase]) –

    The list of regression cases

  • metadata

    (MetaData) –

    The metadata for the regression suite

  • name

    (str) –

    The name of the regression suite. It is used as the name of the suite's report.

Methods:

Attributes:

Source code in medcat-v2/medcat/utils/regression/checking.py
356
357
358
359
360
361
362
def __init__(self, cases: list[RegressionCase],
             metadata: MetaData, name: str) -> None:
    """Set up the regression suite along with its overall report.

    Args:
        cases (list[RegressionCase]): The list of regression cases
        metadata (MetaData): The metadata for the regression suite
        name (str): The name of the suite, used for the report
    """
    self.cases: list[RegressionCase] = cases
    self.report = MultiDescriptor(name=name)
    self.metadata = metadata
    # the suite's report is made up of the reports of each of the cases
    for regr_case in cases:
        self.report.parts.append(regr_case.report)

cases instance-attribute

metadata instance-attribute

metadata = metadata

report instance-attribute

report = MultiDescriptor(name=name)

check_model

Checks model and generates a report

Parameters:

  • cat

    (CAT) –

    The model to check against

  • translation

    (TranslationLayer) –

    The translation layer

  • edit_distance

    (tuple[int, int, int], default: (0, 0, 0) ) –

    The edit distance of the names. Defaults to (0, 0, 0).

  • use_diacritics

    (bool, default: False ) –

    Whether to use diacritics for edit distance.

Returns:

Source code in medcat-v2/medcat/utils/regression/checking.py
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
def check_model(self, cat: CAT, translation: TranslationLayer,
                edit_distance: tuple[int, int, int] = (0, 0, 0),
                use_diacritics: bool = False,
                ) -> MultiDescriptor:
    """Check the model against every sub-case and return the report.

    The sub-cases are iterated with the progress shown.

    Args:
        cat (CAT): The model to check against
        translation (TranslationLayer): The translation layer
        edit_distance (tuple[int, int, int]):
            The edit distance of the names. Defaults to (0, 0, 0).
        use_diacritics (bool): Whether to use diacritics for edit distance.

    Returns:
        MultiDescriptor: A report description
    """
    all_subcases = self.iter_subcases(
        translation,
        show_progress=True,
        edit_distance=edit_distance,
        use_diacritics=use_diacritics,
    )
    for cur_case, cur_target in all_subcases:
        # NOTE: the return value is not needed here since
        #       the finding is reported in the per-case report
        cur_case.check_specific_for_phrase(cat, cur_target, translation)
    return self.report

estimate_total_distinct_cases

estimate_total_distinct_cases() -> int
Source code in medcat-v2/medcat/utils/regression/checking.py
390
391
def estimate_total_distinct_cases(self) -> int:
    """Estimate the total number of distinct sub-cases in the suite.

    Returns:
        int: The sum of the estimates of each of the regression cases.
    """
    total = 0
    for regr_case in self.cases:
        total += regr_case.estimate_num_of_diff_subcases()
    return total

from_dict classmethod

from_dict(in_dict: dict, name: str) -> RegressionSuite

Construct a RegressionSuite from a dict.

Most of the parsing is handled in RegressionCase.from_dict. This just assumes that each key in the dict (other than 'meta', which holds the metadata) is a name and each value describes a RegressionCase.

Parameters:

  • in_dict

    (dict) –

    The input dict.

  • name

    (str) –

    The name of the regression suite.

Returns:

Source code in medcat-v2/medcat/utils/regression/checking.py
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
@classmethod
def from_dict(cls, in_dict: dict, name: str) -> 'RegressionSuite':
    """Construct a RegressionSuite from a dict.

    Most of the parsing is handled in RegressionCase.from_dict.
    This just assumes that each key in the dict is a name
    and each value describes a RegressionCase.

    The only exception is the 'meta' key, which describes the metadata
    of the suite. If it is missing, a warning is logged and the
    metadata is marked as unknown.

    Args:
        in_dict (dict): The input dict.
        name (str): The name of the regression suite.

    Returns:
        RegressionSuite: The built regression suite
    """
    cases: list[RegressionCase] = [
        RegressionCase.from_dict(case_name, details)
        for case_name, details in in_dict.items()
        if case_name != 'meta'  # the metadata is not a regression case
    ]
    if 'meta' in in_dict:
        # NOTE: model_validate is the non-deprecated counterpart of
        #       parse_obj in the pydantic version that has model_dump
        metadata = MetaData.model_validate(in_dict['meta'])
    else:
        # NOTE: Logger.warn is a deprecated alias of Logger.warning
        logger.warning("Loading regression suite without any meta data")
        metadata = MetaData.unknown()
    return RegressionSuite(cases=cases, metadata=metadata, name=name)

from_mct_export classmethod

from_mct_export(file_name: str) -> RegressionSuite
Source code in medcat-v2/medcat/utils/regression/checking.py
523
524
525
526
527
528
529
@classmethod
def from_mct_export(cls, file_name: str) -> 'RegressionSuite':
    with open(file_name) as f:
        data = json.load(f)
    converted = MedCATTrainerExportConverter(data).convert()
    return RegressionSuite.from_dict(
        converted, name=os.path.basename(file_name))

from_yaml classmethod

from_yaml(file_name: str) -> RegressionSuite

Constructs a RegressionSuite from a YAML file.

The from_dict method is used for the construction from the dict.

Parameters:

  • file_name

    (str) –

    The file name

Returns:

  • RegressionChecker ( RegressionSuite ) –

    The constructed regression checker

Source code in medcat-v2/medcat/utils/regression/checking.py
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
@classmethod
def from_yaml(cls, file_name: str) -> 'RegressionSuite':
    """Load a RegressionSuite from a YAML file.

    The contents of the file are passed on to the from_dict method
    and the base name of the file is used as the name of the suite.

    Args:
        file_name (str): The file name

    Returns:
        RegressionSuite: The constructed regression suite
    """
    with open(file_name) as yaml_file:
        suite_dict = yaml.safe_load(yaml_file)
    suite_name = os.path.basename(file_name)
    return RegressionSuite.from_dict(suite_dict, name=suite_name)

get_all_distinct_cases

Gets all the distinct cases for this regression suite.

While distinct cases can be determined without the translation layer, including it here simplifies the process.

Parameters:

  • translation

    (TranslationLayer) –

    The translation layer.

  • edit_distance

    (tuple[int, int, int]) –

    The edit distance(s) to try. Defaults to (0, 0, 0).

  • use_diacritics

    (bool) –

    Whether to use diacritics for edit distance.

Yields:

Source code in medcat-v2/medcat/utils/regression/checking.py
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
def get_all_distinct_cases(self, translation: TranslationLayer,
                           edit_distance: tuple[int, int, int],
                           use_diacritics: bool
                           ) -> Iterator[tuple[RegressionCase,
                                               Iterator[FinalTarget]]]:
    """Generate the distinct cases of every regression case in the suite.

    Each regression case is paired with each of the sub-case iterators
    it provides.

    While distinct cases can be determined without the translation layer,
    including it here simplifies the process.

    Args:
        translation (TranslationLayer): The translation layer.
        edit_distance (tuple[int, int, int]): The edit distance(s) to try.
        use_diacritics (bool): Whether to use diacritics for edit distance.

    Yields:
        Iterator[tuple[RegressionCase, Iterator[FinalTarget]]]:
            The generator of the regression case along with
            its corresponding sub-cases.
    """
    for cur_case in self.cases:
        distinct = cur_case.get_distinct_cases(
            translation, edit_distance, use_diacritics)
        for cur_subcase in distinct:
            yield cur_case, cur_subcase

iter_subcases

Iterate over all the sub-cases.

Each sub-case presents a unique target (phrase, concept, name) on the corresponding regression case.

Parameters:

  • translation

    (TranslationLayer) –

    The translation layer.

  • show_progress

    (bool, default: True ) –

    Whether to show progress. Defaults to True.

  • edit_distance

    (tuple[int, int, int], default: (0, 0, 0) ) –

    The edit distance(s) to try. Defaults to (0, 0, 0).

  • use_diacritics

    (bool, default: False ) –

    Whether to use diacritics for edit distance.

Yields:

  • tuple[RegressionCase, FinalTarget]

    Iterator[tuple[RegressionCase, FinalTarget]]: The generator of the regression case along with each of the final target sub-cases.

Source code in medcat-v2/medcat/utils/regression/checking.py
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
def iter_subcases(self, translation: TranslationLayer,
                  show_progress: bool = True,
                  edit_distance: tuple[int, int, int] = (0, 0, 0),
                  use_diacritics: bool = False,
                  ) -> Iterator[tuple[RegressionCase, FinalTarget]]:
    """Generate every final target along with its regression case.

    Each sub-case presents a unique target (phrase, concept, name) on
    the corresponding regression case.

    The progress is tracked over the distinct cases, using the
    estimated number of distinct cases as the total.

    Args:
        translation (TranslationLayer): The translation layer.
        show_progress (bool): Whether to show progress. Defaults to True.
        edit_distance (tuple[int, int, int]): The edit distance(s) to try.
            Defaults to (0, 0, 0).
        use_diacritics (bool): Whether to use diacritics for edit distance.

    Yields:
        Iterator[tuple[RegressionCase, FinalTarget]]: The generator of the
            regression case along with each of the final target sub-cases.
    """
    expected_total = self.estimate_total_distinct_cases()
    distinct_cases = self.get_all_distinct_cases(
        translation, edit_distance, use_diacritics)
    with_progress = tqdm.tqdm(
        distinct_cases, total=expected_total, disable=not show_progress)
    for cur_case, cur_subcase in with_progress:
        for final_target in cur_subcase:
            yield cur_case, final_target

to_dict

to_dict() -> dict

Converts the RegressionChecker to dict for serialisation.

Returns:

  • dict ( dict ) –

    The dict representation

Source code in medcat-v2/medcat/utils/regression/checking.py
450
451
452
453
454
455
456
457
458
459
460
461
462
def to_dict(self) -> dict:
    """Serialise the RegressionSuite into a dict.

    Each regression case is stored under its name and the metadata is
    stored under 'meta'. Any numpy.float64 in the metadata is converted
    to a float (using fix_np_float64).

    Returns:
        dict: The dict representation
    """
    out = {regr_case.name: regr_case.to_dict() for regr_case in self.cases}
    meta = self.metadata.model_dump()
    out['meta'] = meta
    # changed in place, i.e. this affects the dict stored above
    fix_np_float64(meta)
    return out

to_yaml

to_yaml() -> str

Convert the RegressionChecker to YAML string.

Returns:

  • str ( str ) –

    The YAML representation

Source code in medcat-v2/medcat/utils/regression/checking.py
464
465
466
467
468
469
470
def to_yaml(self) -> str:
    """Serialise the RegressionSuite into a YAML string.

    This dumps the dict representation (see to_dict).

    Returns:
        str: The YAML representation
    """
    as_dict = self.to_dict()
    return yaml.safe_dump(as_dict)

fix_np_float64

fix_np_float64(d: dict) -> None

Fix numpy.float64 in dictionary for yaml saving purposes.

These types of objects are unable to be cleanly serialized using yaml. So we need to convert them to the corresponding floats.

The changes will be made within the dictionary itself as well as dictionaries within, recursively.

Parameters:

  • d

    (dict) –

    The input dict

Source code in medcat-v2/medcat/utils/regression/checking.py
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
def fix_np_float64(d: dict) -> None:
    """Fix numpy.float64 in dictionary for yaml saving purposes.

    These types of objects are unable to be cleanly serialized using yaml.
    So we need to convert them to the corresponding floats.

    The changes will be made within the dictionary itself
    as well as dictionaries and lists within, recursively.

    Args:
        d (dict): The input dict
    """
    import numpy as np

    def _fixed(value):
        """Get the float for a numpy.float64; fix containers in place."""
        if isinstance(value, np.float64):
            return float(value)
        if isinstance(value, dict):
            fix_np_float64(value)
        elif isinstance(value, list):
            for idx, item in enumerate(value):
                new_item = _fixed(item)
                if new_item is not item:
                    value[idx] = new_item
        # containers are fixed in place, everything else is left as is
        return value

    for key, value in d.items():
        new_value = _fixed(value)
        # only existing keys are re-assigned, which is
        # safe to do while iterating over the dict
        if new_value is not value:
            d[key] = new_value

get_ontology_and_version

get_ontology_and_version(model_card: dict) -> tuple[str, str]

Attempt to get ontology (and its version) from a model card dict.

If no ontology is found, 'Unknown' is returned. The version is always returned as the first source ontology. That is, unless the specified location does not exist in the model card, in which case 'Unknown' is returned.

The ontology is assumed to be described at

model_card['Source Ontology'][0] (or model_card['Source Ontology'] if it's a string instead of a list)

The ontology version is read from

model_card['Source Ontology'][0] (or model_card['Source Ontology'] if it's a string instead of a list)

Currently, only SNOMED-CT, UMLS and ICD are supported / found.

Parameters:

  • model_card

    (dict) –

    The input model card.

Returns:

  • tuple[str, str]

    tuple[str, str]: The ontology (if found) or 'Unknown'; and the version (if found) or 'Unknown'

Source code in medcat-v2/medcat/utils/regression/checking.py
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
def get_ontology_and_version(model_card: dict) -> tuple[str, str]:
    """Attempt to get ontology (and its version) from a model card dict.

    If no ontology is found, 'Unknown' is returned.
    The version is always returned as the first source ontology.
    That is, unless the specified location does not exist in the model card
    (or does not hold a string), in which case 'Unknown' is returned
    for both the ontology and the version.

    The ontology is assumed to be described at:
        model_card['Source Ontology'][0] (or model_card['Source Ontology']
        if it's a string instead of a list)

    The ontology version is read from:
        model_card['Source Ontology'][0] (or model_card['Source Ontology']
        if it's a string instead of a list)

    Currently, only SNOMED-CT, UMLS and ICD are supported / found.

    Args:
        model_card (dict): The input model card.

    Returns:
        tuple[str, str]:
            The ontology (if found) or 'Unknown';
            and the version (if found) or 'Unknown'
    """
    try:
        ont_list = model_card['Source Ontology']
        if isinstance(ont_list, list):
            # NOTE: an empty list raises an IndexError (handled below)
            ont1 = ont_list[0]
        elif isinstance(ont_list, str):
            ont1 = ont_list
        else:
            raise KeyError(f"Unknown source ontology: {ont_list}")
        if not isinstance(ont1, str):
            # e.g. a list of something other than strings
            raise KeyError(f"Unknown source ontology: {ont1}")
    except (KeyError, IndexError) as err:
        logger.warning(
            "Didn't find the expected source ontology from the model card!",
            exc_info=err)
        return UNKNOWN_METADATA, UNKNOWN_METADATA
    # find ontology - the first marker that matches is used
    ont_upper = ont1.upper()
    known_ontologies = (
        ('SNOMED', 'SNOMED-CT'),
        ('UMLS', 'UMLS'),
        ('ICD', 'ICD'),
    )
    for marker, ontology in known_ontologies:
        if marker in ont_upper:
            return ontology, ont1
    return UNKNOWN_METADATA, ont1