Skip to content

medcat.model_creation.preprocess_umls

Classes:

  • UMLS

    Pre-process UMLS release files:

Attributes:

all_vals module-attribute

all_vals = [(len(v)) for v in (values())]

df module-attribute

df = to_concept_df()

medcat_csv_mapper module-attribute

medcat_csv_mapper: dict = {'CUI': 'cui', 'STR': 'name', 'SAB': 'ontologies', 'ISPREF': 'name_status', 'TUI': 'type_ids'}

pt2ch module-attribute

pt2ch = get_pt2ch()

random_4_keys module-attribute

random_4_keys = sample(list(keys()), k=4)

save_file module-attribute

save_file = 'preprocessed_umls.csv'

to_ICD10 module-attribute

to_ICD10 = map_umls2icd10()

to_ICD10_man module-attribute

to_ICD10_man = map_umls2source(sources=['ICD10'])

to_snomed module-attribute

to_snomed = map_umls2snomed()

umls module-attribute

umls = UMLS(argv[1], argv[2])

UMLS

UMLS(main_file_name: str, sem_types_file: str, allow_languages: list = ['ENG'], sep: str = '|')

Pre-process UMLS release files. Args: main_file_name (str): Path to the main file (probably MRCONSO.RRF). sem_types_file (str): Path to the semantic types file (probably MRSTY.RRF). allow_languages (list): Languages to keep; rows in any other language are filtered out. Defaults to just English (['ENG']). sep (str): The separator used within the files. Defaults to '|'.

Methods:

Attributes:

Source code in medcat-v2/medcat/model_creation/preprocess_umls.py
72
73
74
75
76
77
78
79
80
81
82
def __init__(self, main_file_name: str, sem_types_file: str,
             allow_languages: list = ['ENG'], sep: str = '|'):
    """Store the input file locations and the parsing options.

    Args:
        main_file_name (str): Path to the main file
            (probably MRCONSO.RRF).
        sem_types_file (str): Path to the semantic types file
            (probably MRSTY.RRF).
        allow_languages (list): Languages to keep when rows are filtered
            on the LAT column. Defaults to just English (['ENG']).
            A falsy value is stored as is.
        sep (str): The separator used within the files. Defaults to '|'.
    """
    # where to read from and how to split the fields
    self.sep = sep
    self.sem_types_file = sem_types_file
    self.main_file_name = main_file_name
    # each instance works on its own copy of the default column lists
    self.mrhier_columns = [*_DEFAULT_MRHIER_COLUMNS]
    self.sem_types_columns = [*_DEFAULT_SEM_TYPE_COLUMNS]
    self.main_columns = [*_DEFAULT_COLUMNS]
    # never keep a reference to the (shared) default list itself
    if allow_languages:
        self.allow_langugages = list(allow_languages)
    else:
        self.allow_langugages = allow_languages

allow_langugages instance-attribute

allow_langugages = list(allow_languages) if allow_languages else allow_languages

main_columns instance-attribute

main_columns = list(_DEFAULT_COLUMNS)

main_file_name instance-attribute

main_file_name = main_file_name

mrhier_columns instance-attribute

mrhier_columns = list(_DEFAULT_MRHIER_COLUMNS)

sem_types_columns instance-attribute

sem_types_columns = list(_DEFAULT_SEM_TYPE_COLUMNS)

sem_types_file instance-attribute

sem_types_file = sem_types_file

sep instance-attribute

sep = sep

get_pt2ch

get_pt2ch() -> dict

Generates a parent to children dict.

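It goes through all the 'isa' rows of MRHIER.RRF that belong to a preferred atom of MRCONSO.RRF and resolves the parent AUI of each row to the CUI of the parent.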

The resulting dictionary maps a CUI to a list of CUIs that consider that CUI as their parent.

PS: This expects the MRHIER.RRF file to also exist in the same folder as the MRCONSO.RRF file.

Raises:

  • ValueError

    If the MRHIER.RRF file wasn't found

Returns:

  • dict ( dict ) –

    The dictionary of parent CUI and their children.

Source code in medcat-v2/medcat/model_creation/preprocess_umls.py
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
def get_pt2ch(self) -> dict:
    """Generates a parent to children dict.

    It goes through all the 'isa' rows of MRHIER.RRF that can be matched
    (on AUI and CUI) to a preferred (ISPREF == 'Y') row of MRCONSO.RRF
    and resolves the parent AUI (PAUI) of each of them to the CUI of
    the parent.

    The resulting dictionary maps a CUI to a list of CUIs that
    consider that CUI as their parent.

    PS:
    This expects the MRHIER.RRF file to also exist in the same folder
    as the MRCONSO.RRF file.

    Raises:
        ValueError: If the MRHIER.RRF file wasn't found
        KeyError: If a parent AUI is not among the (language filtered)
            rows of MRCONSO.RRF

    Returns:
        dict: The dictionary of parent CUI and their children.
    """
    # os.path copes with a bare file name (no directory part) and with
    # platform specific separators, unlike splitting on '/'
    path = os.path.dirname(self.main_file_name) or os.curdir
    hier_file = os.path.join(path, "MRHIER.RRF")

    if not os.path.exists(hier_file):
        raise ValueError(
            'Expected MRHIER.RRF to exist within the same parent folder '
            f'({path})')

    conso_df = pd.read_csv(self.main_file_name, names=self.main_columns,
                           sep=self.sep, index_col=False)

    hier_df = pd.read_csv(hier_file, sep=self.sep, index_col=False,
                          header=None, names=self.mrhier_columns)

    # filter languages
    if self.allow_langugages:
        conso_df = conso_df[conso_df["LAT"].isin(self.allow_langugages)]

    # create a AUI -> CUI map
    # NOTE: built before the ISPREF filter so that a parent AUI can be
    #       resolved even if the parent row is not a preferred one
    aui_cui = dict(zip(conso_df["AUI"], conso_df["CUI"]))

    # remove non-preferred from conso
    conso_df = conso_df[conso_df['ISPREF'] == 'Y']

    # filter ISA relationships
    hier_df = hier_df[hier_df['RELA'] == 'isa']

    # merge dataframes
    merged_df = pd.merge(conso_df, hier_df, on=['AUI', 'CUI'])

    # only keep CUI and parent AUI
    cui_parent = merged_df[['CUI', 'PAUI']]
    # only include CUIs with a parent
    cui_parent = cui_parent[cui_parent['PAUI'].notna()]

    # collect the children of every parent in a set to avoid duplicates;
    # zipping the two columns avoids building a Series for every row
    children_of: dict = {}
    pairs = zip(cui_parent['CUI'], cui_parent['PAUI'])
    for cur_cui, paui in tqdm.tqdm(pairs, total=len(cui_parent.index)):
        parent_cui = aui_cui[paui]
        # avoid self as parent/child
        if parent_cui == cur_cui:
            continue
        children_of.setdefault(parent_cui, set()).add(cur_cui)
    # move from set to list for consistency with SNOMED
    return {parent: list(kids) for parent, kids in children_of.items()}

map_umls2icd10

map_umls2icd10() -> DataFrame

Map to ICD-10.

Available SAB's that contain 'ICD10': - CCSR_ICD10CM - CCSR_ICD10CM (Clinical Classifications Software Refined for ICD-10-CM) - Synopsis - CCSR_ICD10PCS - CCSR_ICD10PCS (Clinical Classifications Software Refined for ICD-10-PCS) - Synopsis - DMDICD10 - DMDICD10 (ICD-10 German) - Statistics - ICD10AE - ICD10AE (ICD-10, American English Equivalents) - Synopsis - ICD10AMAE - ICD10AMAE (ICD-10, Australian Modification, Americanized English Equivalents) - Synopsis - ICD10AM - ICD10AM (ICD-10, Australian Modification) - Synopsis - ICD10DUT - ICD10DUT (ICD10, Dutch Translation) - Synopsis - ICD10PCS - ICD10PCS (ICD-10 Procedure Coding System) - Synopsis - ICD10 - ICD10 (International Classification of Diseases and Related Health Problems, Tenth Revision) - Synopsis - ICPC2ICD10DUT - ICPC2ICD10DUT (ICPC2-ICD10 Thesaurus, Dutch Translation) - Synopsis - ICPC2ICD10ENG - ICPC2ICD10ENG (ICPC2-ICD10 Thesaurus) - Synopsis - MTHICPC2ICD10AE - MTHICPC2ICD10AE (ICPC2E-ICD10 Thesaurus, American English Equivalents) - Synopsis

Currently only using 'ICD10'. But others may be relevant as well.

If one wants to use one of the other sources listed above, they would need to use the map_umls2source method.

Returns:

  • DataFrame

    pd.DataFrame: DataFrame that has the ICD-10 codes

Source code in medcat-v2/medcat/model_creation/preprocess_umls.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
def map_umls2icd10(self) -> pd.DataFrame:
    """Map to ICD-10.

    This is a shortcut for calling map_umls2source with the single
    source 'ICD10'.

    SABs whose name contains 'ICD10':
     - CCSR_ICD10CM (Clinical Classifications Software Refined for ICD-10-CM)
     - CCSR_ICD10PCS (Clinical Classifications Software Refined for ICD-10-PCS)
     - DMDICD10 (ICD-10 German)
     - ICD10AE (ICD-10, American English Equivalents)
     - ICD10AMAE (ICD-10, Australian Modification, Americanized English Equivalents)
     - ICD10AM (ICD-10, Australian Modification)
     - ICD10DUT (ICD10, Dutch Translation)
     - ICD10PCS (ICD-10 Procedure Coding System)
     - ICD10 (International Classification of Diseases and Related Health Problems, Tenth Revision)
     - ICPC2ICD10DUT (ICPC2-ICD10 Thesaurus, Dutch Translation)
     - ICPC2ICD10ENG (ICPC2-ICD10 Thesaurus)
     - MTHICPC2ICD10AE (ICPC2E-ICD10 Thesaurus, American English Equivalents)

    Only 'ICD10' is used here, although the others may be relevant as
    well. To map to any of the other sources listed above, call the
    map_umls2source method directly.

    Returns:
        pd.DataFrame: DataFrame that has the ICD-10 codes
    """  # noqa
    icd10_source = 'ICD10'
    return self.map_umls2source(sources=icd10_source)

map_umls2snomed

map_umls2snomed() -> DataFrame

Map to SNOMED-CT.

Currently, uses the SCUI column. At the time of writing, this is equal to the CODE column. But this may not be the case in the future.

Returns:

  • DataFrame

    pd.DataFrame: Dataframe that contains the SCUI (source CUI) as well as the UMLS CUI for each applicable concept

Source code in medcat-v2/medcat/model_creation/preprocess_umls.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def map_umls2snomed(self) -> pd.DataFrame:
    """Map to SNOMED-CT.

    Currently, uses the SCUI column. At the time of writing, this is equal
    to the CODE column.
    But this may not be the case in the future.

    Returns:
        pd.DataFrame: Dataframe that contains the SCUI (source CUI) as
            well as the UMLS CUI for each applicable concept. The rows
            are sorted by SCUI and SCUI is the first column.
    """
    # SCUI is read as a string so that the codes are not turned into numbers
    df = pd.read_csv(self.main_file_name, names=self.main_columns,
                     sep=self.sep, index_col=False, dtype={'SCUI': 'str'})
    # get only SNOMED-CT US based concepts that have a SNOMED-CT (source)
    # CUI; both conditions go into ONE mask, since applying a second mask
    # built on the unfiltered frame to the filtered frame makes pandas
    # reindex the mask (and warn about it)
    keep = (df['SAB'] == 'SNOMEDCT_US') & df['SCUI'].notna()
    df = df[keep]
    # sort by SCUI
    df = df.sort_values(by='SCUI').reset_index(drop=True)
    # rearrange with SCUI as the first column
    df = df[['SCUI'] + [col for col in df.columns if col != 'SCUI']]
    return df

map_umls2source

map_umls2source(sources: Union[str, List[str]]) -> DataFrame

Allows mapping to an arbitrary source vocabulary (or to several of them).

Parameters:

Returns:

  • DataFrame

    pd.DataFrame: DataFrame that has the target source codes

Source code in medcat-v2/medcat/model_creation/preprocess_umls.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
def map_umls2source(self, sources: Union[str, List[str]]) -> pd.DataFrame:
    """Allows mapping to an arbitrary source (or to several of them).

    Only the rows whose SAB is one of the given sources and that have
    a CODE are kept.

    Args:
        sources (Union[str, List[str]]): The source or sources to include.

    Returns:
        pd.DataFrame: DataFrame that has the target source codes. The
            rows are sorted by CODE and CODE is the first column.
    """
    # a single source is handled as a list of one, so there is only one
    # code path (this also accepts other iterables such as tuples)
    wanted = [sources] if isinstance(sources, str) else list(sources)
    # CODE is read as a string so that the codes are not turned into numbers
    df = pd.read_csv(self.main_file_name, names=self.main_columns,
                     sep=self.sep, index_col=False, dtype={'CODE': 'str'})
    # get the specified source(s); both conditions go into ONE mask,
    # since applying a second mask built on the unfiltered frame to the
    # filtered frame makes pandas reindex the mask (and warn about it)
    keep = df['SAB'].isin(wanted) & df['CODE'].notna()
    df = df[keep]
    # sort by CODE
    df = df.sort_values(by='CODE').reset_index(drop=True)
    # rearrange columns starting with CODE
    df = df[['CODE'] + [col for col in df.columns if col != 'CODE']]
    return df

to_concept_df

to_concept_df() -> DataFrame

Create a concept DataFrame. The default column names are expected.

Returns:

  • DataFrame

    pd.DataFrame: The resulting DataFrame

Source code in medcat-v2/medcat/model_creation/preprocess_umls.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def to_concept_df(self) -> pd.DataFrame:
    """Build the concept DataFrame from the main and semantic type files.

    The files are expected to use the default column names.

    Returns:
        pd.DataFrame: The resulting DataFrame
    """
    # target columns:
    # cui, name, name_status, ontologies, description_type_ids, type_ids
    concepts = pd.read_csv(self.main_file_name,
                           names=self.main_columns, sep=self.sep,
                           index_col=False)

    # keep only the allowed languages (no filtering for a falsy value)
    languages = self.allow_langugages
    if languages:
        concepts = concepts[concepts["LAT"].isin(languages)]

    # TODO filter by activity ?

    # the semantic types file provides the TUI; the merge is done on
    # whatever columns the two frames have in common
    sem_types = pd.read_csv(
        self.sem_types_file, names=self.sem_types_columns, sep=self.sep,
        index_col=False)
    concepts = concepts.merge(sem_types)

    # switch to the column names of the MedCAT CSV format
    concepts = concepts.rename(columns=medcat_csv_mapper)

    # anything that still carries one of the initial column names was not
    # renamed and is therefore not needed
    initial_names = self.main_columns + self.sem_types_columns
    leftovers = [col for col in concepts.columns if col in initial_names]

    # looks like description_type_ids is not really used anywhere,
    # so it is not looked for

    return concepts.drop(columns=leftovers)