medcat.utils.data_utils

Classes:

TestTrainSplitter – Splits a MedCAT trainer export into a train set and a test set.

Functions:

get_false_positives –

Get the false positives within a trainer export.
make_mc_train_test –

Make the train and test sets.

TestTrainSplitter

TestTrainSplitter(data: MedCATTrainerExport, cdb: CDB, test_size: float = 0.2)

Methods:

split –

Attributes:

MAX_TEST_FRACTION –
MIN_CNT_FOR_TEST –
cdb –
data –
test_size –

Source code in medcat-v2/medcat/utils/data_utils.py

def __init__(self, data: MedCATTrainerExport, cdb: CDB,
             test_size: float = 0.2):
    """Set up a splitter for the given trainer export.

    Args:
        data (MedCATTrainerExport): The trainer export to split.
        cdb (CDB): The concept database.
        test_size (float): The test size. Defaults to 0.2.
    """
    self.test_size = test_size
    self.cdb = cdb
    self.data = data
    # NOTE(review): _reset() is defined outside this view; it presumably
    # initialises the bookkeeping that split() relies on - confirm.
    self._reset()

MAX_TEST_FRACTION `class-attribute` `instance-attribute`

MAX_TEST_FRACTION = 0.3

MIN_CNT_FOR_TEST `class-attribute` `instance-attribute`

MIN_CNT_FOR_TEST = 10

cdb `instance-attribute`

cdb = cdb

data `instance-attribute`

data = data

test_size `instance-attribute`

test_size = test_size

split

split() -> tuple[MedCATTrainerExport, MedCATTrainerExport, int, int]

Source code in medcat-v2/medcat/utils/data_utils.py

def split(self) -> tuple[MedCATTrainerExport, MedCATTrainerExport,
                         int, int]:
    """Randomly split the trainer export into a train and a test export.

    Projects, and the documents within each project, are visited in a
    random order (``np.random.permutation``). While the test set holds
    less than ``test_size`` of all counted annotations, each document is
    passed to ``_split_doc_train_test`` together with both target
    projects. Once the test set is full (or when no annotations were
    counted at all) every remaining document is added to the train
    project, so that no document is lost from the output.

    Every output project is a shallow copy of the input project with its
    own fresh ``documents`` list; the input export is not modified here.

    Returns:
        tuple[MedCATTrainerExport, MedCATTrainerExport, int, int]:
            The train set, the test set, ``self.test_anns`` and
            ``self.total_anns`` as they stand after the split.
    """
    # Count all CUIs
    for project in self.data['projects']:
        self._count_project(project)

    test_set: MedCATTrainerExport = {'projects': []}
    train_set: MedCATTrainerExport = {'projects': []}

    perm_arr: list[int] = cast(
        list[int],
        np.random.permutation(range(
            len(self.data['projects']))).tolist())

    for i_project in perm_arr:
        project = self.data['projects'][i_project]
        cui_filter = None

        # copy everything else, but reset documents list
        test_project: MedCATTrainerExportProject = project.copy()
        train_project: MedCATTrainerExportProject = project.copy()
        test_project['documents'] = []
        train_project['documents'] = []

        # An optional comma separated CUI list on the project is passed on
        # as the filter; a missing or blank value means "no filter".
        if 'cuis' in project and project['cuis'].strip():
            cui_filter = [x.strip() for x in project['cuis'].split(",")]

        num_of_docs = len(project['documents'])
        for i_document in np.random.permutation(range(0, num_of_docs)):
            document = project['documents'][i_document]
            # Do we have enough documents in the test set?
            # A total of zero annotations is treated as "full" as well,
            # which also avoids dividing by zero.
            test_is_full = (
                self.total_anns == 0 or
                self.test_anns / self.total_anns >= self.test_size)
            if test_is_full:
                # Keep the document for training instead of dropping it
                # from both sets.
                train_project['documents'].append(document)
                continue
            self._split_doc_train_test(document, cui_filter,
                                       train_project, test_project)

        test_set['projects'].append(test_project)
        train_set['projects'].append(train_project)

    return train_set, test_set, self.test_anns, self.total_anns

get_false_positives

get_false_positives(doc: MedCATTrainerExportDocument, spacy_doc: MutableDocument) -> list[MutableEntity]

Get the false positives within a trainer export.

Parameters:

doc
(MedCATTrainerExportDocument) –

The trainer export.
spacy_doc
(MutableDocument) –

The annotated document.

Returns:

list[MutableEntity] –

The list of false positive entities.

Source code in medcat-v2/medcat/utils/data_utils.py

def get_false_positives(doc: MedCATTrainerExportDocument,
                        spacy_doc: MutableDocument
                        ) -> list[MutableEntity]:
    """Collect the predicted entities that match no annotation.

    A predicted entity is considered matched when its
    ``(base.start_index, cui)`` pair equals the ``(start, cui)`` pair of
    one of the annotations in ``doc``. Everything else is returned, in
    the order of ``spacy_doc.ner_ents``.

    Args:
        doc (MedCATTrainerExportDocument): The trainer export.
        spacy_doc (MutableDocument): The annotated document.

    Returns:
        list[MutableEntity]: The list of false positive entities.
    """
    annotated = {(ann['start'], ann['cui']) for ann in doc['annotations']}
    return [
        predicted for predicted in spacy_doc.ner_ents
        if (predicted.base.start_index, predicted.cui) not in annotated
    ]

make_mc_train_test

make_mc_train_test(data: MedCATTrainerExport, cdb: CDB, test_size: float = 0.2) -> tuple

Make the train and test sets.

This is a disaster.

Parameters:

data
(MedCATTrainerExport) –

The data.
cdb
(CDB) –

The concept database.
test_size
(float, default: 0.2 ) –

The test size. Defaults to 0.2.

Returns:

tuple ( tuple ) –

The train set, the test set, the test annotations, and the total annotations

Source code in medcat-v2/medcat/utils/data_utils.py

def make_mc_train_test(data: MedCATTrainerExport,
                       cdb: CDB, test_size: float = 0.2) -> tuple:
    """Split a trainer export into a train set and a test set.

    Convenience wrapper: builds a ``TestTrainSplitter`` from the arguments
    and returns whatever its ``split()`` method returns.

    Args:
        data (MedCATTrainerExport): The data.
        cdb (CDB): The concept database.
        test_size (float): The test size. Defaults to 0.2.

    Returns:
        tuple:
            The train set, the test set, the test annotations,
            and the total annotations
    """
    splitter = TestTrainSplitter(data, cdb, test_size)
    return splitter.split()

medcat.utils.data_utils

TestTrainSplitter

MAX_TEST_FRACTION `class-attribute` `instance-attribute`

MIN_CNT_FOR_TEST `class-attribute` `instance-attribute`

cdb `instance-attribute`

data `instance-attribute`

test_size `instance-attribute`

split

get_false_positives

`doc`

`spacy_doc`

make_mc_train_test

`data`

`cdb`

`test_size`

medcat.utils.data_utils

TestTrainSplitter

MAX_TEST_FRACTION class-attribute instance-attribute

MIN_CNT_FOR_TEST class-attribute instance-attribute

cdb instance-attribute

data instance-attribute

test_size instance-attribute

split

get_false_positives

doc

spacy_doc

make_mc_train_test

data

cdb

test_size

MAX_TEST_FRACTION `class-attribute` `instance-attribute`

MIN_CNT_FOR_TEST `class-attribute` `instance-attribute`

cdb `instance-attribute`

data `instance-attribute`

test_size `instance-attribute`

`doc`

`spacy_doc`

`data`

`cdb`

`test_size`