Skip to content

medcat.utils.legacy.conversion_all

Classes:

  • Converter

    Converts v1 models to v2 models.

Functions:

  • unpack

    Unpack v1 model into target folder.

Attributes:

logger module-attribute

logger = getLogger(__name__)

Converter

Converter(medcat1_model_pack_path: str, new_model_pack_path: Optional[str], ser_type: AvailableSerialisers = dill)

Converts v1 models to v2 models.

Methods:

  • convert

    Use the gathered information to convert to a v2 model.

Attributes:

Source code in medcat-v2/medcat/utils/legacy/conversion_all.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def __init__(self, medcat1_model_pack_path: str,
             new_model_pack_path: Optional[str],
             ser_type: AvailableSerialisers = AvailableSerialisers.dill):
    if medcat1_model_pack_path.endswith(".zip"):
        folder_path = medcat1_model_pack_path[:-4]
        self.old_model_name = os.path.split(
            medcat1_model_pack_path)[1].rsplit(".zip", 1)[0]
        unpack(medcat1_model_pack_path, folder_path)
        medcat1_model_pack_path = folder_path
    else:
        self.old_model_name = os.path.split(medcat1_model_pack_path)[1]
    if not os.path.isdir(medcat1_model_pack_path):
        raise ValueError(
            "Provided model path is not a directory: "
            f"{medcat1_model_pack_path}")
    self.old_model_folder = medcat1_model_pack_path
    self.new_model_folder = new_model_pack_path
    self.ser_type = ser_type
    self._validate()

cdb_name class-attribute instance-attribute

cdb_name = 'cdb.dat'

config_name class-attribute instance-attribute

config_name = 'config.json'

expected_files_in_folder property

expected_files_in_folder

The base names of the required files in a folder for a v1 model.

new_model_folder instance-attribute

new_model_folder = new_model_pack_path

old_model_folder instance-attribute

old_model_folder = medcat1_model_pack_path

old_model_name instance-attribute

old_model_name = rsplit('.zip', 1)[0]

ser_type instance-attribute

ser_type = ser_type

vocab_name class-attribute instance-attribute

vocab_name = 'vocab.dat'

convert

convert() -> CAT

Use the gathered information to convert to a v2 model.

This converts the CDB, Vocab, and Config, in order and then created the model pack.

If self.new_model_folder is set, the model will be saved as well.

Returns:

  • CAT ( CAT ) –

    The model pack.

Source code in medcat-v2/medcat/utils/legacy/conversion_all.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
def convert(self) -> CAT:
    """Use the gathered information to convert to a v2 model.

    This converts the CDB, Vocab, and Config, in order and then
    created the model pack.

    If `self.new_model_folder` is set, the model will be saved as well.

    Returns:
        CAT: The model pack.
    """
    cdb = get_cdb_from_old(
        os.path.join(self.old_model_folder, self.cdb_name),
        fix_spacy_model_name=False)
    vocab_path = os.path.join(self.old_model_folder, self.vocab_name)
    if os.path.exists(vocab_path):
        vocab = get_vocab_from_old(vocab_path)
    else:
        # e.g in case of DeID
        vocab = None
    cnf_path = os.path.join(self.old_model_folder, self.config_name)
    if os.path.exists(cnf_path):
        config = get_config_from_old(cnf_path)
    else:
        config = cdb.config
    with temp_changed_config(
        config.general.nlp, "modelname",
        os.path.join(self.old_model_folder,
                     config.general.nlp.modelname)):
        cat = CAT(cdb, vocab, config)
    # NOTE: its probably easier if we change the spacy model name
    #       afterwards
    fix_spacy_model_name(config, cat.pipe.tokenizer)
    fix_subnames(cat)
    # MetaCATs
    meta_cat_folders = [
        os.path.join(self.old_model_folder, subfolder)
        for subfolder in os.listdir(self.old_model_folder)
        if subfolder.startswith("meta_")
    ]
    if meta_cat_folders:
        from medcat.utils.legacy.convert_meta_cat import (
            get_meta_cat_from_old)
        for subfolder in meta_cat_folders:
            mc = get_meta_cat_from_old(subfolder, cat.pipe.tokenizer)
            cat.add_addon(mc)

    # RelCATs
    rel_cats_folders = [
        os.path.join(self.old_model_folder, subfolder)
        for subfolder in os.listdir(self.old_model_folder)
        if subfolder.startswith("rel_")
    ]
    if rel_cats_folders:
        from medcat.utils.legacy.convert_rel_cat import (
            get_rel_cat_from_old)
        for subfolder in rel_cats_folders:
            rel_cat = get_rel_cat_from_old(
                cdb, subfolder, cat.pipe.tokenizer)
            cat.add_addon(rel_cat)

    # DeID / TransformersNER
    trf_folders = [
        os.path.join(self.old_model_folder, subfolder)
        for subfolder in os.listdir(self.old_model_folder)
        if subfolder.startswith("trf_")
    ]
    if trf_folders:
        from medcat.utils.legacy.convert_deid import get_trf_ner_from_old
        trf_ners = [
            get_trf_ner_from_old(subfolder, cat.pipe.tokenizer)
            for subfolder in trf_folders
        ]
        if len(trf_ners) > 1:
            raise ValueError("Cannot use more than 1 tranformers NER. "
                             f"Got {len(trf_ners)}")
        logger.info("Found a Transformers based NER component "
                    "- probably for DeID")
        trf_ner = trf_ners[0]
        # update the TrfNER-loaded CDB with the correct config
        trf_ner._component.cdb.config = config
        # update the config with the correct NER custom config
        trf_cnf = trf_ner._component.config
        config.components.ner.custom_cnf = trf_cnf
        # update the component name
        config.components.ner.comp_name = trf_ner.name
        # replace component in pipeline
        # get the index of component in list
        index = next((c_num for c_num, comp in
                      enumerate(cat.pipe._components)
                      if comp.get_type() is CoreComponentType.ner))
        # set / change / replace the NER component
        logger.info(f"Changing the NER component in the pipe to {trf_ner}")
        cat.pipe._components[index] = trf_ner
        # replace linker to no-action linker
        config.components.linking.comp_name = 'no_action'
        index_link = next(
            (c_num for c_num, comp in enumerate(cat.pipe._components)
             if comp.get_type() is CoreComponentType.linking))
        # set / change / replace Linker to no-action linker
        logger.info("Changing the linking component in the pipe to a "
                    "no-action linker")
        cat.pipe._components[index_link] = NoActionLinker()

    if self.new_model_folder:
        logger.info("Saving converted model to '%s'",
                    self.new_model_folder)
        cat.save_model_pack(self.new_model_folder,
                            pack_name=self.old_model_name + 'v2',
                            serialiser_type=self.ser_type)
    return cat

unpack

Unpack v1 model into target folder.

Parameters:

  • model_zip_path

    (str) –

    ZIP path.

  • target_folder

    (str) –

    Target folder.

Source code in medcat-v2/medcat/utils/legacy/conversion_all.py
173
174
175
176
177
178
179
180
def unpack(model_zip_path: str, target_folder: str):
    """Unpack v1 model into target folder.

    Args:
        model_zip_path (str): ZIP path.
        target_folder (str): Target folder.
    """
    shutil.unpack_archive(model_zip_path, extract_dir=target_folder)