Skip to content

medcat.model_creation.preprocess_snomed

Classes:

Functions:

Attributes:

PER_FILE_TYPE_PATHS module-attribute

PER_FILE_TYPE_PATHS = {concept: join('Snapshot', 'Terminology'), description: join('Snapshot', 'Terminology'), relationship: join('Snapshot', 'Terminology'), refset: join('Snapshot', 'Refset', 'Map')}

SNOMED_FOLDER_NAME_PATTERN module-attribute

SNOMED_FOLDER_NAME_PATTERN = compile('^SnomedCT_([A-Za-z0-9]+)_([A-Za-z0-9]+)_(\\d{8}T\\d{6}Z$)')

BundleDescriptor dataclass

BundleDescriptor(extensions: list[SupportedExtension], ignores: dict[RefSetFileType, list[SupportedExtension]] = dict())

Methods:

Attributes:

extensions instance-attribute

extensions: list[SupportedExtension]

ignores class-attribute instance-attribute

ignores: dict[RefSetFileType, list[SupportedExtension]] = field(default_factory=dict)

has_invalid

has_invalid(ext: SupportedExtension, file_types: tuple[RefSetFileType]) -> bool
Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
196
197
198
199
200
201
202
203
204
def has_invalid(self, ext: SupportedExtension,
                file_types: tuple[RefSetFileType]) -> bool:
    """Check whether an extension is ignored for any of the file types.

    Args:
        ext (SupportedExtension): The extension to look for.
        file_types (tuple[RefSetFileType]): The file types to check.

    Returns:
        bool: True if `ext` is listed in `self.ignores` under at least
            one of `file_types`; False otherwise (including when none of
            the file types has an entry in `self.ignores`).
    """
    return any(ext in self.ignores[file_type]
               for file_type in file_types
               if file_type in self.ignores)

ExtensionDescription dataclass

ExtensionDescription(exp_name_in_folder: str, exp_files: FileFormatDescriptor, exp_2nd_part_in_folder: Optional[str] = None)

Attributes:

exp_2nd_part_in_folder class-attribute instance-attribute

exp_2nd_part_in_folder: Optional[str] = None

exp_files instance-attribute

exp_name_in_folder instance-attribute

exp_name_in_folder: str

FileFormatDescriptor dataclass

FileFormatDescriptor(concept: str, description: str, relationship: str, refset: str, common_prefix: str = 'sct2_')

Methods:

Attributes:

common_prefix class-attribute instance-attribute

common_prefix: str = 'sct2_'

concept instance-attribute

concept: str

description instance-attribute

description: str

refset instance-attribute

refset: str

relationship instance-attribute

relationship: str

get_concept

get_concept() -> str
Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
105
106
def get_concept(self) -> str:
    """Return the file name part configured for the concept file type.

    Returns:
        str: Whatever `get_file_per_type` yields for
            `RefSetFileType.concept`.
    """
    file_type = RefSetFileType.concept
    return self.get_file_per_type(file_type)

get_description

get_description() -> str
Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
108
109
def get_description(self) -> str:
    """Return the file name part configured for the description file type.

    Returns:
        str: Whatever `get_file_per_type` yields for
            `RefSetFileType.description`.
    """
    file_type = RefSetFileType.description
    return self.get_file_per_type(file_type)

get_file_per_type

get_file_per_type(file_type: RefSetFileType) -> str
Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
 96
 97
 98
 99
100
def get_file_per_type(self, file_type: RefSetFileType) -> str:
    """Return the file name part for the given file type.

    Args:
        file_type (RefSetFileType): The file type to look up.

    Returns:
        str: The raw name from `_get_raw`, prefixed with
            `self.common_prefix` for every type except the refset, which
            is returned without the prefix.
    """
    raw_name = self._get_raw(file_type)
    if file_type == RefSetFileType.refset:
        # the refset name is used as-is, without the common prefix
        return raw_name
    return self.common_prefix + raw_name

get_refset

get_refset() -> str
Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
114
115
def get_refset(self) -> str:
    """Return the file name part configured for the refset file type.

    Returns:
        str: Whatever `get_file_per_type` yields for
            `RefSetFileType.refset`.
    """
    file_type = RefSetFileType.refset
    return self.get_file_per_type(file_type)

get_relationship

get_relationship() -> str
Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
111
112
def get_relationship(self) -> str:
    """Return the file name part configured for the relationship file type.

    Returns:
        str: Whatever `get_file_per_type` yields for
            `RefSetFileType.relationship`.
    """
    file_type = RefSetFileType.relationship
    return self.get_file_per_type(file_type)

ignore_all classmethod

ignore_all() -> FileFormatDescriptor
Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
91
92
93
94
@classmethod
def ignore_all(cls) -> 'FileFormatDescriptor':
    """Build a descriptor whose four file types are all the ignore tag.

    Returns:
        FileFormatDescriptor: An instance with `concept`, `description`,
            `relationship` and `refset` all set to `_IGNORE_TAG`.
    """
    ignored_fields = dict.fromkeys(
        ('concept', 'description', 'relationship', 'refset'), _IGNORE_TAG)
    return cls(**ignored_fields)

RefSetFileType

Bases: Enum

Attributes:

concept class-attribute instance-attribute

concept = auto()

description class-attribute instance-attribute

description = auto()

refset class-attribute instance-attribute

refset = auto()

relationship class-attribute instance-attribute

relationship = auto()

Snomed

Snomed(data_path)

Pre-process SNOMED CT release files.

This class is used to create a SNOMED CT concept DataFrame ready for MedCAT CDB creation.

Attributes:

  • data_path (str) –

    Path to the unzipped SNOMED CT folder.

  • release (str) –

    Release of SNOMED CT folder.

  • uk_ext (bool) –

    Specifies whether the version is a SNOMED UK extension released after 2021. Defaults to False.

  • uk_drug_ext (bool) –

    Specifies whether the version is a SNOMED UK drug extension. Defaults to False.

  • au_ext (bool) –

    Specifies whether the version is an AU release. Defaults to False.

Methods:

Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
265
266
267
268
269
def __init__(self, data_path):
    """Store the data path and detect the bundle, paths and releases.

    Args:
        data_path: Path handed to `_determine_bundle`; it is also kept
            as `self.data_path`.
    """
    self.data_path = data_path
    self.bundle = self._determine_bundle(self.data_path)
    # one entry per detected release in each of the three sequences
    detected = self._check_path_and_release()
    self.paths, self.snomed_releases, self.exts = detected

NO_VERSION_DETECTED class-attribute instance-attribute

NO_VERSION_DETECTED = 'N/A'

bundle instance-attribute

bundle = _determine_bundle(data_path)

data_path instance-attribute

data_path = data_path

list_all_relationships

list_all_relationships()

List all SNOMED CT relationships.

SNOMED CT provides a rich set of inter-relationships between concepts.

Returns:

  • list

    List of all SNOMED CT relationships.

Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
def list_all_relationships(self):
    """
    List all SNOMED CT relationships.

    SNOMED CT provides a rich set of inter-relationships between concepts.
    For every detected release the relationship snapshot file is read and
    the `typeId` values of its active rows are collected.

    Returns:
        list: The relationship `typeId` values. They are unique within
            each release, but a value may repeat if several releases
            contain it.

    Raises:
        FileNotFoundError: If no concept snapshot file is found in a
            release folder, so the version part of the file names cannot
            be determined.
    """
    all_rela = []
    for i, snomed_release in enumerate(self.snomed_releases):
        self._set_extension(snomed_release, self.exts[i])
        contents_path = os.path.join(
            self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept])
        exp_files = self._extension.value.exp_files
        concept_snapshot = exp_files.get_concept()
        relationship_snapshot = exp_files.get_relationship()
        if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or (
                self.bundle and self.bundle.value.has_invalid(
                    self._extension, [RefSetFileType.concept,
                                      RefSetFileType.description])):
            continue

        # Reset per release: otherwise a release without a matching
        # concept file would reuse the previous release's version (or
        # fail with an UnboundLocalError on the first release).
        snomed_v = None
        for f in os.listdir(contents_path):
            m = re.search(f'{concept_snapshot}' + r'_(.*)_\d*.txt', f)
            if m:
                # if several files match, the last one listed wins
                snomed_v = m.group(1)
        if snomed_v is None:
            raise FileNotFoundError(
                f"No file matching '{concept_snapshot}_<version>_<date>.txt'"
                f" found in '{contents_path}'")

        int_relat = parse_file(os.path.join(
            contents_path,
            f'{relationship_snapshot}_{snomed_v}_{snomed_release}.txt'))
        active_relat = int_relat[int_relat.active == '1']
        del int_relat

        all_rela.extend(active_relat["typeId"].unique())
    return all_rela

map_snomed2icd10

map_snomed2icd10()

This function maps SNOMED CT concepts to ICD-10 codes using the refset mappings provided in the SNOMED CT release package.

Returns:

  • dict

    A dictionary containing the SNOMED CT to ICD-10 mappings including metadata.

Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
521
522
523
524
525
526
527
528
529
530
531
def map_snomed2icd10(self):
    """
    Map SNOMED CT concepts to ICD-10 codes using the refset mappings
    provided in the SNOMED CT release package.

    The first element returned by `_map_snomed2refset()` is converted
    with `_refset_df2dict()`.

    Returns:
        dict: A dictionary containing the SNOMED CT to ICD-10 mappings
            including metadata.
    """
    refset_frames = self._map_snomed2refset()
    icd10_frame = refset_frames[0]
    return self._refset_df2dict(icd10_frame)

map_snomed2opcs4

map_snomed2opcs4() -> dict

This function maps SNOMED CT concepts to OPCS-4 codes using the refset mappings provided in the SNOMED CT release package.

Then it calls the internal function _map_snomed2refset() to get the DataFrame containing the OPCS-4 mappings. The function then converts the DataFrame to a dictionary using the internal function _refset_df2dict()

Raises:

  • AttributeError –

    If OPCS-4 mappings aren't available.

Returns:

  • dict ( dict ) –

    A dictionary containing the SNOMED CT to OPCS-4 mappings including metadata.

Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
def map_snomed2opcs4(self) -> dict:
    """
    Map SNOMED CT concepts to OPCS-4 codes using the refset mappings
    provided in the SNOMED CT release package.

    The mapping is only attempted when at least one detected extension is
    the UK clinical or the UK drug extension. The second element returned
    by `_map_snomed2refset()` is then converted with `_refset_df2dict()`.

    Raises:
        AttributeError: If OPCS-4 mappings aren't available.

    Returns:
        dict: A dictionary containing the SNOMED CT to OPCS-4 mappings
            including metadata.
    """
    has_uk_extension = any(
        ext in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG)
        for ext in self.exts)
    if not has_uk_extension:
        raise AttributeError(
            "OPCS-4 mapping does not exist in this edition")
    refset_frames = self._map_snomed2refset()
    return self._refset_df2dict(refset_frames[1])

relationship2json

relationship2json(relationshipcode, output_jsonfile)

Convert a single relationship map structure to JSON file.

Parameters:

  • relationshipcode

    (str) –

    A single SCTID or unique concept identifier of the relationship type.

  • output_jsonfile

    (str) –

    Name of JSON file output.

Returns:

  • file

    JSON file of relationship mapping.

Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
def relationship2json(self, relationshipcode, output_jsonfile):
    """
    Convert a single relationship map structure to JSON file.

    For every detected release the active rows of the relationship
    snapshot are read. The written JSON object maps each `destinationId`
    seen in any active row to the list of `sourceId` values linked to it
    by the requested relationship type. Destinations without such a link
    map to an empty list.

    Args:
        relationshipcode (str): A single SCTID or unique concept identifier
            of the relationship type.
        output_jsonfile (str): Name of JSON file output.

    Returns:
        None: The mapping is written to `output_jsonfile`.

    Raises:
        FileNotFoundError: If no concept snapshot file is found in a
            release folder, so the version part of the file names cannot
            be determined.
    """
    output_dict = {}
    # file contents are parsed as strings, so compare against a string
    relationship_type = str(relationshipcode)
    for i, snomed_release in enumerate(self.snomed_releases):
        self._set_extension(snomed_release, self.exts[i])
        contents_path = os.path.join(
            self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept])
        exp_files = self._extension.value.exp_files
        concept_snapshot = exp_files.get_concept()
        relationship_snapshot = exp_files.get_relationship()
        if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or (
                self.bundle and self.bundle.value.has_invalid(
                    self._extension, [RefSetFileType.concept,
                                      RefSetFileType.description])):
            continue

        # Reset per release: otherwise a release without a matching
        # concept file would reuse the previous release's version (or
        # fail with an UnboundLocalError on the first release).
        snomed_v = None
        for f in os.listdir(contents_path):
            m = re.search(f'{concept_snapshot}' + r'_(.*)_\d*.txt', f)
            if m:
                # if several files match, the last one listed wins
                snomed_v = m.group(1)
        if snomed_v is None:
            raise FileNotFoundError(
                f"No file matching '{concept_snapshot}_<version>_<date>.txt'"
                f" found in '{contents_path}'")

        int_relat = parse_file(os.path.join(
            contents_path,
            f'{relationship_snapshot}_{snomed_v}_{snomed_release}.txt'))
        active_relat = int_relat[int_relat.active == '1']
        del int_relat

        # every active destination gets a key, even with no matching rows
        relationship = {
            key: [] for key in active_relat["destinationId"].unique()}
        # filter once instead of testing the type on every row
        matching = active_relat[active_relat['typeId'] == relationship_type]
        for destination, source in zip(matching['destinationId'],
                                       matching['sourceId']):
            relationship[destination].append(source)

        # append this release's sources to those of earlier releases
        for key, sources in relationship.items():
            output_dict.setdefault(key, []).extend(sources)
    with open(output_jsonfile, 'w') as json_file:
        json.dump(output_dict, json_file)

to_concept_df

to_concept_df()

Create a SNOMED CT concept DataFrame.

Creates a SNOMED CT concept DataFrame ready for MEDCAT CDB creation. Checks if the version is a UK extension release and sets the correct file names for the concept and description snapshots accordingly. Additionally, handles the divergent release format of the UK Drug Extension >v2021 with the uk_drug_ext variable.

Returns:

  • pandas.DataFrame: SNOMED CT concept DataFrame.

Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
def to_concept_df(self):
    """
    Create a SNOMED CT concept DataFrame.

    Creates a SNOMED CT concept DataFrame ready for MEDCAT CDB creation.
    For every detected release, the concept and description snapshot files
    are read, restricted to active rows, joined on the concept id and
    reduced to the names whose description type is primary ('P') or
    synonym ('A'). The per-release frames are then concatenated.

    Returns:
        pandas.DataFrame: SNOMED CT concept DataFrame with the columns
            `cui`, `name`, `name_status`, `ontologies`,
            `description_type_ids` and `type_ids`.

    Raises:
        FileNotFoundError: If no concept snapshot file is found in a
            release folder, so the version part of the file names cannot
            be determined.
        ValueError: Raised by `pandas.concat` when every release was
            skipped and there is nothing to concatenate.
    """

    df2merge = []
    for i, snomed_release in enumerate(self.snomed_releases):
        self._set_extension(snomed_release, self.exts[i])
        contents_path = os.path.join(
            self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept])
        exp_files = self._extension.value.exp_files
        concept_snapshot = exp_files.get_concept()
        description_snapshot = exp_files.get_description()
        if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or (
                self.bundle and self.bundle.value.has_invalid(
                    self._extension, [RefSetFileType.concept,
                                      RefSetFileType.description])):
            continue

        # Reset per release: otherwise a release without a matching
        # concept file would reuse the previous release's version (or
        # fail with an UnboundLocalError on the first release).
        snomed_v = None
        for f in os.listdir(contents_path):
            m = re.search(f'{concept_snapshot}' + r'_(.*)_\d*.txt', f)
            if m:
                # if several files match, the last one listed wins
                snomed_v = m.group(1)
        if snomed_v is None:
            raise FileNotFoundError(
                f"No file matching '{concept_snapshot}_<version>_<date>.txt'"
                f" found in '{contents_path}'")

        int_terms = parse_file(os.path.join(
            contents_path,
            f'{concept_snapshot}_{snomed_v}_{snomed_release}.txt'))
        active_terms = int_terms[int_terms.active == '1']
        del int_terms

        int_desc = parse_file(os.path.join(
            contents_path,
            f'{description_snapshot}_{snomed_v}_{snomed_release}.txt'))
        active_descs = int_desc[int_desc.active == '1']
        del int_desc

        # both frames have an 'id' column; after the merge the concept
        # one is available as 'id_x'
        merged = pd.merge(active_terms, active_descs,
                          left_on=['id'], right_on=['conceptId'],
                          how='inner')
        del active_terms
        del active_descs

        active_with_primary_desc = merged[
            merged['typeId'] == '900000000000003001']  # becomes 'P' below
        active_with_synonym_desc = merged[
            merged['typeId'] == '900000000000013009']  # becomes 'A' below
        del merged
        # primary names first, then synonyms
        active_with_all_desc = pd.concat(
            [active_with_primary_desc, active_with_synonym_desc])

        active_snomed_df = active_with_all_desc[['id_x', 'term', 'typeId']]
        del active_with_all_desc

        active_snomed_df = active_snomed_df.rename(
            columns={'id_x': 'cui', 'term': 'name',
                     'typeId': 'name_status'})
        active_snomed_df['ontologies'] = 'SNOMED-CT'
        active_snomed_df['name_status'] = active_snomed_df[
            'name_status'].replace(
                ['900000000000003001', '900000000000013009'],
                ['P', 'A'])
        active_snomed_df = active_snomed_df.reset_index(drop=True)

        # explicit copy so the new column is added to an independent
        # frame rather than to a slice of active_snomed_df
        temp_df = active_snomed_df[
            active_snomed_df['name_status'] == 'P'][['cui', 'name']].copy()
        # the trailing "(...)" of the primary name is the semantic tag
        temp_df['description_type_ids'] = temp_df['name'].str.extract(
            r"\((\w+\s?.?\s?\w+.?\w+.?\w+.?)\)$")
        active_snomed_df = pd.merge(
            active_snomed_df,
            temp_df.loc[:, ['cui', 'description_type_ids']],
            on='cui',
            how='left')
        del temp_df

        # Hash semantic tag to get a 8 digit type_id code
        active_snomed_df['type_ids'] = (
            active_snomed_df['description_type_ids'].apply(
                lambda x: int(
                    hashlib.sha256(str(x).encode('utf-8')).hexdigest(),
                    16) % 10 ** 8))
        df2merge.append(active_snomed_df)

    return pd.concat(df2merge).reset_index(drop=True)

SupportedBundles

Bases: Enum

Attributes:

UK_CLIN class-attribute instance-attribute

UK_DRUG_EXT class-attribute instance-attribute

UK_DRUG_EXT = BundleDescriptor(extensions=[UK_DRUG, UK_EDITION])

SupportedExtension

Bases: Enum

Attributes:

AU class-attribute instance-attribute

AU = ExtensionDescription(exp_name_in_folder='Release', exp_2nd_part_in_folder='AU1000036', exp_files=FileFormatDescriptor(concept='Concept_Snapshot', description='Description_Snapshot-en-AU', relationship='Relationship_Snapshot', refset=_IGNORE_TAG))

INTERNATIONAL class-attribute instance-attribute

INTERNATIONAL = ExtensionDescription(exp_name_in_folder='InternationalRF2', exp_files=FileFormatDescriptor(concept='Concept_Snapshot', description='Description_Snapshot-en', relationship='Relationship_Snapshot', refset='der2_iisssccRefset_ExtendedMapSnapshot'))

UK_CLINICAL class-attribute instance-attribute

UK_CLINICAL = ExtensionDescription(exp_name_in_folder='UKClinicalRF2', exp_files=FileFormatDescriptor(concept='Concept_UKCLSnapshot', description='Description_UKCLSnapshot-en', relationship='Relationship_UKCLSnapshot', refset='der2_iisssciRefset_ExtendedMapUKCLSnapshot'))

UK_CLINICAL_REFSET class-attribute instance-attribute

UK_CLINICAL_REFSET = ExtensionDescription(exp_name_in_folder='UKClinicalRefsetsRF2', exp_files=ignore_all())

UK_DRUG class-attribute instance-attribute

UK_DRUG = ExtensionDescription(exp_name_in_folder='UKDrugRF2', exp_files=FileFormatDescriptor(concept='Concept_UKDGSnapshot', description='Description_UKDGSnapshot-en', relationship='Relationship_UKDGSnapshot', refset='der2_iisssciRefset_ExtendedMapUKDGSnapshot'))

UK_EDITION class-attribute instance-attribute

UK_EDITION = ExtensionDescription(exp_name_in_folder='UKEditionRF2', exp_files=FileFormatDescriptor(concept='Concept_UKEDSnapshot', description='Description_UKEDSnapshot-en', relationship='Relationship_UKEDSnapshot', refset='der2_iisssciRefset_ExtendedMapUKEDSnapshot'))

UnkownSnomedReleaseException

UnkownSnomedReleaseException(*args)

Bases: ValueError

Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
675
676
def __init__(self, *args) -> None:
    """Create the exception, forwarding all positional arguments.

    Args:
        *args: Passed unchanged to the parent class initialiser.
    """
    super().__init__(*args)

get_all_children

get_all_children(sctid, pt2ch)

Retrieves all the children of a given SNOMED CT ID (SCTID) from a given parent-to-child mapping (pt2ch) via the "IS A" relationship. pt2ch can be found in a MedCAT model in the additional info via the call: cat.cdb.addl_info['pt2ch']

Parameters:

  • sctid

    (int) –

    The SCTID whose children need to be retrieved.

  • pt2ch

    (dict) –

    A dictionary containing the parent-to-child relationships in the form {parent_sctid: [list of child sctids]}.

Returns:

  • list

    A list of unique SCTIDs that are children of the given SCTID.

Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def get_all_children(sctid, pt2ch):
    """
    Retrieves all the children of a given SNOMED CT ID (SCTID) from a given
    parent-to-child mapping (pt2ch) via the "IS A" relationship.
    pt2ch can be found in a MedCAT model in the additional info
    via the call: cat.cdb.addl_info['pt2ch']

    Every concept is expanded only once, so concepts reachable through
    several parents are not revisited and a cyclic mapping still
    terminates.

    Args:
        sctid (int): The SCTID whose children need to be retrieved.
        pt2ch (dict): A dictionary containing the parent-to-child
            relationships in the form {parent_sctid: [list of child sctids]}.

    Returns:
        list: A list of unique SCTIDs, in no particular order, made up of
            the given SCTID itself and all of its direct and indirect
            children.
    """
    seen = set()
    stack = [sctid]
    while stack:
        # remove the last element from the stack
        current_snomed = stack.pop()
        if current_snomed in seen:
            # already expanded via another parent (or a cycle)
            continue
        seen.add(current_snomed)
        stack.extend(pt2ch.get(current_snomed, []))
    return list(seen)

get_direct_refset_mapping

get_direct_refset_mapping(in_dict: dict) -> dict

This method uses the output from Snomed.map_snomed2icd10 or Snomed.map_snomed2opcs4 and removes the metadata and maps each SNOMED CUI to the prioritised list of the target ontology CUIs.

The input dict is expected to be in the following format: - Keys are SnomedCT CUIs - The values are lists of dictionaries, each list item (at least) - Has a key 'code' that specifies the target ontology CUI - Has a key 'mapPriority' that specifies the priority

Parameters:

  • in_dict

    (dict) –

    The input dict.

Returns:

  • dict ( dict ) –

    The map from each Snomed CUI to the prioritised list of target ontology CUIs.

Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def get_direct_refset_mapping(in_dict: dict) -> dict:
    """Strip the metadata from a refset mapping, keeping only target codes.

    This takes the output of Snomed.map_snomed2icd10 or
    Snomed.map_snomed2opcs4 and maps each SNOMED CUI to the list of
    target ontology CUIs, ordered by descending 'mapPriority' value.
    Entries with equal 'mapPriority' keep their input order.

    The input dict is expected to be in the following format:
    - Keys are SnomedCT CUIs
    - The values are lists of dictionaries, each list item (at least)
      - Has a key 'code' that specifies the target ontology CUI
      - Has a key 'mapPriority' that specifies the priority

    NOTE(review): 'mapPriority' values are compared as given; if they are
    strings the order is lexicographic - confirm this is intended.

    Args:
        in_dict (dict): The input dict.

    Returns:
        dict: The map from each Snomed CUI to the prioritised list of
            target ontology CUIs.
    """
    def priority_of(entry):
        return entry['mapPriority']

    direct_mapping = {}
    for snomed_cui, targets in in_dict.items():
        # largest 'mapPriority' value first
        ordered = list(targets)
        ordered.sort(key=priority_of, reverse=True)
        # drop everything except the target code / CUI
        direct_mapping[snomed_cui] = [entry['code'] for entry in ordered]
    return direct_mapping

match_partials_with_folders

match_partials_with_folders(exp_names: list[tuple[str, Optional[str]]], folder_names: list[str], _group_nr1: int = 1, _group_nr2: int = 2) -> bool
Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
def match_partials_with_folders(exp_names: list[tuple[str, Optional[str]]],
                                folder_names: list[str],
                                _group_nr1: int = 1, _group_nr2: int = 2
                                ) -> bool:
    """Check that every expected name can be paired with its own folder.

    Each folder's base name is matched against
    `SNOMED_FOLDER_NAME_PATTERN`. An expected entry is satisfied by the
    first remaining folder whose group `_group_nr1` equals the expected
    name and, when a second part is given, whose group `_group_nr2`
    equals that second part. A folder satisfies at most one entry.

    Args:
        exp_names (list[tuple[str, Optional[str]]]): Expected
            (name, optional second part) pairs.
        folder_names (list[str]): Folder paths; only base names are used.
        _group_nr1 (int): Pattern group compared with the expected name.
        _group_nr2 (int): Pattern group compared with the second part.

    Returns:
        bool: True if every expected entry found a distinct folder,
            False otherwise.
    """
    if len(folder_names) < len(exp_names):
        # fewer folders than expectations: they cannot all be satisfied
        return False
    remaining = [os.path.basename(path) for path in folder_names]
    for wanted_name, wanted_second in exp_names:
        for idx, candidate in enumerate(remaining):
            parsed = SNOMED_FOLDER_NAME_PATTERN.match(candidate)
            if (parsed
                    and parsed.group(_group_nr1) == wanted_name
                    and not (wanted_second and
                             parsed.group(_group_nr2) != wanted_second)):
                # this folder is used up; it cannot satisfy another entry
                del remaining[idx]
                break
        else:
            # no remaining folder satisfies this expectation
            return False
    return True

parse_file

parse_file(filename, first_row_header=True, columns=None)
Source code in medcat-v2/medcat/model_creation/preprocess_snomed.py
11
12
13
14
15
def parse_file(filename, first_row_header=True, columns=None):
    """Read a tab-separated, UTF-8 encoded text file into a DataFrame.

    Every line is split on tab characters and each field is stripped of
    surrounding whitespace; all values are kept as strings.

    Args:
        filename: Path of the file to read.
        first_row_header (bool): If True, the first row provides the
            column names and is not part of the data. If False, every
            row is data and `columns` provides the column names.
        columns: Column names used when `first_row_header` is False.

    Returns:
        pandas.DataFrame: One row per data line of the file.

    Raises:
        IndexError: If the file is empty and `first_row_header` is True.
    """
    with open(filename, encoding='utf-8') as f:
        entities = [[n.strip() for n in line.split('\t')] for line in f]
    if first_row_header:
        return pd.DataFrame(entities[1:], columns=entities[0])
    # without a header row the first line is data and must be kept
    return pd.DataFrame(entities, columns=columns)