Skip to content

medcat.utils.data_utils

Classes:

Functions:

TestTrainSplitter

TestTrainSplitter(data: MedCATTrainerExport, cdb: CDB, test_size: float = 0.2)

Methods:

Attributes:

Source code in medcat-v2/medcat/utils/data_utils.py
15
16
17
18
19
20
def __init__(self, data: MedCATTrainerExport, cdb: CDB,
             test_size: float = 0.2):
    """Store the inputs for the split, then call ``_reset``.

    Args:
        data (MedCATTrainerExport): The data.
        cdb (CDB): The concept database.
        test_size (float): The test size. Defaults to 0.2.
    """
    # All three attributes are set before ``_reset`` runs.
    self.data, self.cdb, self.test_size = data, cdb, test_size
    self._reset()

MAX_TEST_FRACTION class-attribute instance-attribute

MAX_TEST_FRACTION = 0.3

MIN_CNT_FOR_TEST class-attribute instance-attribute

MIN_CNT_FOR_TEST = 10

cdb instance-attribute

cdb = cdb

data instance-attribute

data = data

test_size instance-attribute

test_size = test_size

split

Source code in medcat-v2/medcat/utils/data_utils.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def split(self) -> tuple[MedCATTrainerExport, MedCATTrainerExport,
                         int, int]:
    """Split the trainer export into a train set and a test set.

    Projects, and the documents within each project, are visited in a
    random order drawn from ``np.random.permutation``. While the share
    of test annotations (``self.test_anns / self.total_anns``) is below
    ``self.test_size``, each document is passed to
    ``_split_doc_train_test`` together with both project copies. Once
    the target share is reached (or when no annotations were counted
    at all), every remaining document is placed in the train set so
    that no document is lost from the output.

    Returns:
        tuple[MedCATTrainerExport, MedCATTrainerExport, int, int]:
            The train set, the test set, the number of test
            annotations, and the total number of annotations.
    """
    # Count all CUIs
    for project in self.data['projects']:
        self._count_project(project)

    test_set: MedCATTrainerExport = {'projects': []}
    train_set: MedCATTrainerExport = {'projects': []}

    perm_arr: list[int] = cast(
        list[int],
        np.random.permutation(range(
            len(self.data['projects']))).tolist())

    for i_project in perm_arr:
        project = self.data['projects'][i_project]
        cui_filter = None

        # copy everything else, but reset documents list
        # (shallow copies are enough: only 'documents' is rebound)
        test_project: MedCATTrainerExportProject = project.copy()
        train_project: MedCATTrainerExportProject = project.copy()
        test_project['documents'] = []
        train_project['documents'] = []

        if 'cuis' in project and len(project['cuis'].strip()) > 0:
            cui_filter = [x.strip() for x in project['cuis'].split(",")]

        num_of_docs = len(project['documents'])
        for i_document in np.random.permutation(range(0, num_of_docs)):
            document = project['documents'][i_document]
            # Do we have enough documents in the test set?
            # A zero annotation total is treated as "enough" so the
            # ratio below never divides by zero.
            test_is_full = (
                self.total_anns == 0
                or self.test_anns / self.total_anns >= self.test_size)
            if test_is_full:
                # Keep the document for training instead of dropping it
                # from both sets.
                train_project['documents'].append(document)
                continue
            self._split_doc_train_test(document, cui_filter,
                                       train_project, test_project)

        test_set['projects'].append(test_project)
        train_set['projects'].append(train_project)

    return train_set, test_set, self.test_anns, self.total_anns

get_false_positives

Get the false positives within a trainer export.

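Parameters:

  • doc

    (MedCATTrainerExportDocument) –

    The trainer export.

  • spacy_doc

    (MutableDocument) –

    The annotated document.

Returns:

  • list[MutableEntity] –

    The list of false positive entities.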

Source code in medcat-v2/medcat/utils/data_utils.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def get_false_positives(doc: MedCATTrainerExportDocument,
                        spacy_doc: MutableDocument
                        ) -> list[MutableEntity]:
    """Collect the entities that have no matching annotation.

    An entity from ``spacy_doc.ner_ents`` is considered matched only
    when its ``(start index, CUI)`` pair equals the ``(start, cui)``
    pair of one of the annotations in ``doc``.

    Args:
        doc (MedCATTrainerExportDocument): The trainer export document
            whose annotations serve as the reference.
        spacy_doc (MutableDocument): The annotated document.

    Returns:
        list[MutableEntity]: The list of false positive entities.
    """
    gold_pairs = {(ann['start'], ann['cui']) for ann in doc['annotations']}
    return [
        predicted for predicted in spacy_doc.ner_ents
        if (predicted.base.start_index, predicted.cui) not in gold_pairs
    ]

make_mc_train_test

make_mc_train_test(data: MedCATTrainerExport, cdb: CDB, test_size: float = 0.2) -> tuple

Make train and test sets from a trainer export.

This is a disaster.

Parameters:

  • data

    (MedCATTrainerExport) –

    The data.

  • cdb

    (CDB) –

    The concept database.

  • test_size

    (float, default: 0.2 ) –

    The test size. Defaults to 0.2.

Returns:

  • tuple ( tuple ) –

    The train set, the test set, the test annotations, and the total annotations

Source code in medcat-v2/medcat/utils/data_utils.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def make_mc_train_test(data: MedCATTrainerExport,
                       cdb: CDB, test_size: float = 0.2) -> tuple:
    """Make train and test sets from a trainer export.

    Convenience wrapper that builds a ``TestTrainSplitter`` from the
    arguments and returns the result of its ``split`` method.

    Args:
        data (MedCATTrainerExport): The data.
        cdb (CDB): The concept database.
        test_size (float): The test size. Defaults to 0.2.

    Returns:
        tuple:
            The train set, the test set, the test annotations,
            and the total annotations
    """
    splitter = TestTrainSplitter(data, cdb, test_size)
    return splitter.split()