Skip to content

medcat.components.addons.meta_cat

Modules:

Classes:

Functions:

MetaAnnotationValue

Bases: TypedDict

Attributes:

confidence instance-attribute

confidence: float

name instance-attribute

name: str

value instance-attribute

value: str

MetaCAT

MetaCAT(tokenizer: Optional[TokenizerWrapperBase] = None, embeddings: Optional[Union[Tensor, ndarray]] = None, config: Optional[ConfigMetaCAT] = None, _model_state_dict: Optional[dict[str, Any]] = None, save_dir_path: Optional[str] = None)

Bases: AbstractSerialisable

The MetaCAT class used for training 'Meta-Annotation' models, i.e. annotations of clinical concept annotations. These are also known as properties or attributes of recognised entities in similar tools such as MetaMap and cTAKES.

This is a flexible, model-agnostic class that can learn any meta-annotation task, i.e. any multi-class classification task for recognised terms.

Parameters:

  • tokenizer

    (TokenizerWrapperBase, default: None ) –

    The Huggingface tokenizer instance. This can be a pre-trained tokenizer instance from a BERT-style model, or trained from scratch for the Bi-LSTM (w. attention) model that is currently used in most deployments.

  • embeddings

    ((Tensor, ndarray), default: None ) –

    embedding mapping (sub)word input id n-dim (sub)word embedding.

  • config

    (ConfigMetaCAT, default: None ) –

    the configuration for MetaCAT. Param descriptions available in ConfigMetaCAT docs.

Methods:

Attributes:

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
def __init__(self,
             tokenizer: Optional[TokenizerWrapperBase] = None,
             embeddings: Optional[Union[Tensor, numpy.ndarray]] = None,
             config: Optional[ConfigMetaCAT] = None,
             _model_state_dict: Optional[dict[str, Any]] = None,
             save_dir_path: Optional[str] = None) -> None:
    """Initialise the MetaCAT model wrapper.

    Args:
        tokenizer (Optional[TokenizerWrapperBase]):
            The meta-annotation tokenizer. Defaults to None.
        embeddings (Optional[Union[Tensor, numpy.ndarray]]):
            (Sub)word embeddings; converted to a float32 tensor when
            provided. Defaults to None.
        config (Optional[ConfigMetaCAT]):
            The MetaCAT config; a default ConfigMetaCAT is created
            when None.
        _model_state_dict (Optional[dict[str, Any]]):
            If truthy, loaded into the freshly built model.
        save_dir_path (Optional[str]):
            The save path; forwarded to model construction
            (see get_model).
    """
    if config is None:
        config = ConfigMetaCAT()
    self.config = config
    self.save_dir_path = save_dir_path
    # Seed all RNGs up-front so that model initialisation is reproducible
    set_all_seeds(config.general.seed)

    self.tokenizer = tokenizer
    if tokenizer is not None:
        self._reset_tokenizer_info()

    # NOTE(review): embeddings are always cast to float32 regardless of
    # the input dtype -- confirm this is intended for non-float32 inputs.
    self.embeddings = (torch.tensor(
        embeddings, dtype=torch.float32) if embeddings is not None
        else None)
    self.model = self.get_model(embeddings=self.embeddings)
    if _model_state_dict:
        self.model.load_state_dict(_model_state_dict)

config instance-attribute

config = config

embeddings instance-attribute

embeddings = tensor(embeddings, dtype=float32) if embeddings is not None else None

model instance-attribute

model = get_model(embeddings=embeddings)

name class-attribute instance-attribute

name = 'meta_cat'

save_dir_path instance-attribute

save_dir_path = save_dir_path

tokenizer instance-attribute

tokenizer = tokenizer

batch_generator staticmethod

Generator for batch of documents.

Parameters:

Yields:

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
@staticmethod
def batch_generator(stream: Iterable[MutableDocument],
                    batch_size_chars: int
                    ) -> Iterable[list[MutableDocument]]:
    """Yield batches of documents.

    Documents are accumulated until at least `batch_size_chars`
    characters have been gathered, at which point the batch is yielded.
    A final, possibly smaller, batch is yielded for any remainder.

    Args:
        stream (Iterable[MutableDocument]):
            The document stream
        batch_size_chars (int):
            Number of characters per batch

    Yields:
        list[MutableDocument]: The batch of documents.
    """
    current_batch: list[MutableDocument] = []
    chars_so_far = 0
    for document in stream:
        current_batch.append(document)
        chars_so_far += len(document.base.text)
        if chars_so_far >= batch_size_chars:
            yield current_batch
            current_batch = []
            chars_so_far = 0

    # Flush whatever remains once the stream is exhausted
    if current_batch:
        yield current_batch

eval

Evaluate from json.

Parameters:

  • json_path

    (str) –

    The json file path

Returns:

Raises:

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
def eval(self, json_path: str) -> EvalModelResults:
    """Evaluate from json.

    Args:
        json_path (str):
            The json file path

    Returns:
        EvalModelResults:
            The resulting model dict

    Raises:
        AssertionError: If self.tokenizer is not set
        Exception: If the category name does not exist
    """
    g_config = self.config.general
    t_config = self.config.train

    with open(json_path, 'r') as f:
        data_loaded: dict = json.load(f)

    # Prepare the data (a tokenizer is required to build the contexts)
    assert self.tokenizer is not None
    data_in = prepare_from_json(
        data_loaded, g_config.cntx_left, g_config.cntx_right,
        self.tokenizer, cui_filter=t_config.cui_filter,
        replace_center=g_config.replace_center,
        prerequisites=t_config.prerequisites, lowercase=g_config.lowercase)

    # Check whether the category name is present in the data
    category_name = g_config.get_applicable_category_name(data_in)
    if category_name is None:
        raise Exception(
            "The category name does not exist in this json file.")

    data = data_in[category_name]

    # We already have everything, just get the data
    category_value2id = g_config.category_value2id
    data, _, _ = encode_category_values(
        data, existing_category_value2id=category_value2id)

    # Run evaluation (tokenizer already asserted non-None above)
    result = eval_model(self.model, data, config=self.config,
                        tokenizer=self.tokenizer)

    return result

get_ents

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
683
684
685
def get_ents(self, doc: MutableDocument) -> Iterable[MutableEntity]:
    """Get the entities of a document that meta-annotations apply to.

    NOTE(review): currently returns the document's NER entities only;
    span groups are not (yet) considered.
    """
    # TODO - use span groups?
    return doc.ner_ents  # TODO: is this correct?

get_hash

get_hash() -> str

A partial hash trying to catch differences between models.

Returns:

  • str ( str ) –

    The hex hash.

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
387
388
389
390
391
392
393
394
395
396
397
398
399
def get_hash(self) -> str:
    """A partial hash trying to catch differences between models.

    Only the (dumped) config is hashed; a training timestamp is
    ensured first so that the dump is fully populated.

    Returns:
        str: The hex hash.
    """
    # Make sure last_train_on is set before hashing the config dump
    train_cnf = self.config.train
    if train_cnf.last_train_on is None:
        train_cnf.last_train_on = datetime.now().timestamp()

    hasher = Hasher()
    hasher.update(self.config.model_dump())
    return hasher.hexdigest()

get_init_attrs classmethod

get_init_attrs() -> list[str]
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
295
296
297
298
@classmethod
def get_init_attrs(cls) -> list[str]:
    """Constructor arguments to capture when serialising."""
    return [
        'tokenizer',
        'embeddings',
        'config',
        '_model_state_dict',
    ]

get_model

get_model(embeddings: Optional[Tensor]) -> Module

Get the model

Parameters:

  • embeddings

    (Optional[Tensor]) –

    The embedding tensor

Raises:

  • ValueError –

    If the meta model is not LSTM or BERT

Returns:

  • Module –

    nn.Module: The module

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
def get_model(self, embeddings: Optional[Tensor]) -> nn.Module:
    """Get the model

    Args:
        embeddings (Optional[Tensor]):
            The embedding tensor

    Raises:
        ValueError: If the meta model is not LSTM or BERT

    Returns:
        nn.Module:
            The module
    """
    config = self.config
    if config.model.model_name == 'lstm':
        from medcat.components.addons.meta_cat.models import LSTM
        model: nn.Module = LSTM(embeddings, config)
        logger.info("LSTM model used for classification")

    elif config.model.model_name == 'bert':
        from medcat.components.addons.meta_cat.models import (
            BertForMetaAnnotation)
        model = BertForMetaAnnotation(config, self.save_dir_path)

        # Unless layers are frozen, fine-tune via LoRA adapters
        if not config.model.model_freeze_layers:
            peft_config = LoraConfig(
                task_type=TaskType.SEQ_CLS, inference_mode=False, r=8,
                lora_alpha=16, target_modules=["query", "value"],
                lora_dropout=0.2)

            # NOTE: Not sure what changed between transformers 4.50.3 and
            # 4.50.1 that made this fail for mypy. But as best as I can
            # tell, it still works just the same
            model = get_peft_model(model, peft_config)  # type: ignore
            # model.print_trainable_parameters()

        logger.info("BERT model used for classification")

    else:
        raise ValueError("Unknown model name %s" % config.model.model_name)

    return model

get_model_card

get_model_card(as_dict: Literal[True]) -> dict
get_model_card(as_dict: Literal[False]) -> str
get_model_card(as_dict: bool = False) -> Union[str, dict]

A minimal model card.

Parameters:

  • as_dict

    (bool, default: False ) –

    Return the model card as a dictionary instead of a str. Defaults to False.

Returns:

  • Union[str, dict] –

    Union[str, dict]: An indented JSON object. OR A JSON object in dict form.

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
def get_model_card(self, as_dict: bool = False) -> Union[str, dict]:
    """A minimal model card.

    Args:
        as_dict (bool):
            Return the model card as a dictionary instead of a str.
            Defaults to `False`.

    Returns:
        Union[str, dict]:
            An indented JSON object.
            OR A JSON object in dict form.
    """
    general_cnf = self.config.general
    card = {
        'Category Name': general_cnf.category_name,
        'Description': general_cnf.description,
        'Classes': general_cnf.category_value2id,
        'Model': self.config.model.model_name
    }
    # Either the raw dict, or a pretty-printed JSON string
    return card if as_dict else json.dumps(card, indent=2, sort_keys=False)

ignore_attrs classmethod

ignore_attrs() -> list[str]
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
300
301
302
@classmethod
def ignore_attrs(cls) -> list[str]:
    """Attributes not serialised (rebuilt when loading)."""
    return ['model', 'save_dir_path']

include_properties classmethod

include_properties() -> list[str]
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
304
305
306
@classmethod
def include_properties(cls) -> list[str]:
    """Properties to include when serialising."""
    return ['_model_state_dict']

prepare_document

Prepares document.

Parameters:

  • doc

    (Doc) –

    The document

  • input_ids

    (list) –

    Input ids

  • offset_mapping

    (list) –

    Offset mappings

  • lowercase

    (bool) –

    Whether to use lower case replace center

Returns:

  • tuple[dict, list] –

    tuple[dict, list]: Entity id to index mapping and Samples

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
def prepare_document(self, doc: MutableDocument, input_ids: list,
                     offset_mapping: list, lowercase: bool
                     ) -> tuple[dict, list]:
    """Prepares document.

    For each recognised entity, builds a context window of token ids
    (`cntx_left` tokens to the left and `cntx_right` to the right of the
    entity's tokens), optionally replacing the entity tokens with the
    configured `replace_center` text.

    Args:
        doc (Doc):
            The document
        input_ids (list):
            Input ids
        offset_mapping (list):
            Offset mappings (per-token (start_char, end_char) pairs)
        lowercase (bool):
            Whether to use lower case replace center

    Returns:
        tuple[dict, list]:
            Entity id to index mapping
            and
            Samples
    """
    config = self.config
    cntx_left = config.general.cntx_left
    cntx_right = config.general.cntx_right
    replace_center = config.general.replace_center

    ents = self.get_ents(doc)

    samples = []
    last_ind = 0
    # Map from entity ID to where it is in the samples array
    ent_id2ind = {}
    # Entities are processed left-to-right so the token search can
    # resume from where the previous entity was found.
    for ent in sorted(ents, key=lambda ent: ent.base.start_char_index):
        start = ent.base.start_char_index
        end = ent.base.end_char_index

        # Updated implementation to extract all the tokens for
        # the medical entity (rather than the one)
        ctoken_idx = []
        for ind, pair in enumerate(offset_mapping[last_ind:]):
            # Checking if we've reached at the start of the entity
            if start <= pair[0] or start <= pair[1]:
                if end <= pair[1]:
                    # End reached; update for correct index
                    ctoken_idx.append(last_ind + ind)
                    break
                else:
                    # Keep going; update for correct index
                    ctoken_idx.append(last_ind + ind)

        # Start where the last ent was found, cannot be before it as we've
        # sorted
        # NOTE(review): 'ind' would be unbound (and 'ctoken_idx' empty) if
        # offset_mapping[last_ind:] were empty -- presumably entities are
        # always covered by the token offsets; verify against callers.
        last_ind += ind  # If we did not start from 0 in the for loop

        _start = max(0, ctoken_idx[0] - cntx_left)
        _end = min(len(input_ids), ctoken_idx[-1] + 1 + cntx_right)

        # Token-id window around the entity, and the entity's positions
        # re-based onto that window
        tkns = input_ids[_start:_end]
        cpos = cntx_left + min(0, ind - cntx_left)
        cpos_new = [x - _start for x in ctoken_idx]

    # Optionally swap the entity tokens for the replacement text
        if replace_center is not None:
            if lowercase:
                replace_center = replace_center.lower()
            # We start from ind
            s_ind = ind
            e_ind = ind
            for _ind, pair in enumerate(offset_mapping[ind:]):
                if end > pair[0] and end <= pair[1]:
                    e_ind = _ind + ind
                    break
            ln = e_ind - s_ind  # Length of the concept in tokens
            assert self.tokenizer is not None
            tkns = tkns[:cpos] + self.tokenizer(
                replace_center)['input_ids'] + tkns[cpos + ln + 1:]
        samples.append([tkns, cpos_new])
        ent_id2ind[ent.id] = len(samples) - 1

    return ent_id2ind, samples

train_from_json

train_from_json(json_path: Union[str, list], save_dir_path: Optional[str] = None, data_oversampled: Optional[list] = None, overwrite: bool = False) -> dict

Train or continue training a model given a json_path containing a MedCATtrainer export. It will continue training if an existing model is loaded or start new training if the model is blank/new.

Parameters:

  • json_path

    (Union[str, list]) –

    Path/Paths to a MedCATtrainer export containing the meta_annotations we want to train for.

  • save_dir_path

    (Optional[str], default: None ) –

    In case we have auto_save_model (meaning during the training the best model will be saved) we need to set a save path. Defaults to None.

  • data_oversampled

    (Optional[list], default: None ) –

    In case of oversampling being performed, the data will be passed in the parameter allowing the model to be trained on original + synthetic data.

  • overwrite

    (bool, default: False ) –

    Whether to allow overwriting the file if/when appropriate.

Returns:

  • dict ( dict ) –

    The resulting report.

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
def train_from_json(self, json_path: Union[str, list],
                    save_dir_path: Optional[str] = None,
                    data_oversampled: Optional[list] = None,
                    overwrite: bool = False) -> dict:
    """Train or continue training a model given MedCATtrainer export(s).

    Training continues if an existing model is loaded, or starts fresh
    if the model is blank/new.

    Args:
        json_path (Union[str, list]):
            Path (or list of paths) to MedCATtrainer export(s)
            containing the meta_annotations we want to train for.
        save_dir_path (Optional[str]):
            In case we have auto_save_model (meaning during the
            training the best model will be saved) we need to
            set a save path. Defaults to `None`.
        data_oversampled (Optional[list]):
            In case of oversampling being performed, the data
            will be passed in the parameter allowing the
            model to be trained on original + synthetic data.
        overwrite (bool):
            Whether to allow overwriting the file if/when appropriate.

    Returns:
        dict: The resulting report.
    """
    # Normalise to a list of paths
    paths = [json_path] if isinstance(json_path, str) else json_path

    # Merge all exports by concatenating their 'projects' lists
    data_loaded: dict = {}
    for path in paths:
        with open(path, 'r') as f:
            loaded = json.load(f)
        if not data_loaded:
            data_loaded = loaded
        elif loaded is not None:
            data_loaded['projects'].extend(loaded['projects'])

    return self.train_raw(data_loaded, save_dir_path,
                          data_oversampled=data_oversampled,
                          overwrite=overwrite)

train_raw

train_raw(data_loaded: dict, save_dir_path: Optional[str] = None, data_oversampled: Optional[list] = None, overwrite: bool = False) -> dict

Train or continue training a model given raw data. It will continue training if an existing model is loaded or start new training if the model is blank/new.

The raw data is expected in the following format: { 'projects': [ # list of projects { 'name': '', 'documents': [ # list of documents { 'name': '', 'text': '', 'annotations': [ # list of annotations { # start index of the annotation 'start': -1, 'end': 1, # end index of the annotation 'cui': 'cui', 'value': '' }, ... ], }, ... ] }, ... ] }

Parameters:

  • data_loaded

    (dict) –

    The raw data we want to train for.

  • save_dir_path

    (Optional[str], default: None ) –

    In case we have auto_save_model (meaning during the training the best model will be saved) we need to set a save path. Defaults to None.

  • data_oversampled

    (Optional[list], default: None ) –

    In case of oversampling being performed, the data will be passed in the parameter allowing the model to be trained on original + synthetic data. The format of which is expected: [[['text','of','the','document'], [index of medical entity], "label" ], ['text','of','the','document'], [index of medical entity], "label" ]]

  • overwrite

    (bool, default: False ) –

    Whether to allow overwriting the file if/when appropriate.

Returns:

  • dict ( dict ) –

    The resulting report.

Raises:

  • Exception –

    If no save path is specified, or category name not in data.

  • AssertionError –

    If no tokeniser is set

  • FileNotFoundError –

    If phase_number is set to 2 and model.dat file is not found

  • KeyError –

    If phase_number is set to 2 and model.dat file contains mismatched architecture

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
def train_raw(self, data_loaded: dict, save_dir_path: Optional[str] = None,
              data_oversampled: Optional[list] = None,
              overwrite: bool = False) -> dict:
    """
    Train or continue training a model given raw data. It will continue
    training if an existing model is loaded or start new training if
    the model is blank/new.

    The raw data is expected in the following format:
    {
        'projects': [  # list of projects
            {
                'name': '<project_name>',
                'documents': [  # list of documents
                    {
                        'name': '<document_name>',
                        'text': '<text_of_document>',
                        'annotations': [  # list of annotations
                            {
                                # start index of the annotation
                                'start': -1,
                                'end': 1,    # end index of the annotation
                                'cui': 'cui',
                                'value': '<annotation_value>'
                            },
                            ...
                        ],
                    },
                    ...
                ]
            },
            ...
        ]
    }

    Args:
        data_loaded (dict):
            The raw data we want to train for.
        save_dir_path (Optional[str]):
            In case we have auto_save_model (meaning during the training
            the best model will be saved) we need to set a save path.
            Defaults to `None`.
        data_oversampled (Optional[list]):
            In case of oversampling being performed, the data will be
            passed in the parameter allowing the model to be trained on
            original + synthetic data. The format of which is expected:
            [[['text','of','the','document'], [index of medical entity],
                "label" ],
            ['text','of','the','document'], [index of medical entity],
                "label" ]]
        overwrite (bool):
            Whether to allow overwriting the file if/when appropriate.

    Returns:
        dict: The resulting report.

    Raises:
        Exception: If no save path is specified, or category name
            not in data.
        AssertionError: If no tokeniser is set
        FileNotFoundError: If phase_number is set to 2 and model.dat
            file is not found
        KeyError: If phase_number is set to 2 and model.dat file
            contains mismatched architecture
    """
    g_config = self.config.general
    t_config = self.config.train

    # Create directories if they don't exist
    if t_config.auto_save_model:
        if save_dir_path is None:
            raise Exception("The `save_dir_path` argument is required if "
                            "`auto_save_model` is `True` in the config")
        else:
            os.makedirs(save_dir_path, exist_ok=True)

    # Prepare the data
    assert self.tokenizer is not None
    data_in = prepare_from_json(
        data_loaded, g_config.cntx_left, g_config.cntx_right,
        self.tokenizer, cui_filter=t_config.cui_filter,
        replace_center=g_config.replace_center,
        prerequisites=t_config.prerequisites, lowercase=g_config.lowercase)

    # Check the category name is present in the prepared data
    category_name = g_config.get_applicable_category_name(
        data_in)
    if category_name is None:
        in_cat_name = g_config.category_name
        raise Exception(
            "The category name does not exist in this json file. "
            f"You've provided '{in_cat_name}', while the possible "
            f"options are: {' | '.join(list(data_in.keys()))}. "
            "Additionally, ensure you populate the "
            "'alternative_category_names' attribute to accommodate "
            "for variations.")

    data = data_in[category_name]
    if data_oversampled:
        # Append the synthetic samples to the original data
        data_sampled = prepare_for_oversampled_data(
            data_oversampled, self.tokenizer)
        data = data + data_sampled

    category_value2id = g_config.category_value2id
    if not category_value2id:
        # Encode the category values
        (full_data, data_undersampled,
         category_value2id) = encode_category_values(
             data, config=self.config,
             alternative_class_names=g_config.alternative_class_names)
    else:
        # We already have everything, just get the data
        (full_data, data_undersampled,
         category_value2id) = encode_category_values(
             data, existing_category_value2id=category_value2id,
             config=self.config,
             alternative_class_names=g_config.alternative_class_names)
        g_config.category_value2id = category_value2id
        self.config.model.nclasses = len(category_value2id)
    # Make sure the config number of classes is the same
    # as the one found in the data
    if len(category_value2id) != self.config.model.nclasses:
        logger.warning(
            "The number of classes set in the config is not the same as "
            f"the one found in the data: {self.config.model.nclasses} vs "
            f"{len(category_value2id)}")
        logger.warning("Auto-setting the nclasses value in config and "
                       "rebuilding the model.")
        self.config.model.nclasses = len(category_value2id)

    # For two-phase training, phase 2 resumes from the phase-1 weights
    if self.config.model.phase_number == 2 and save_dir_path is not None:
        model_save_path = os.path.join(save_dir_path, 'model.dat')
        device = torch.device(g_config.device)
        try:
            self.model.load_state_dict(torch.load(
                model_save_path, map_location=device))
            logger.info("Training model for Phase 2, with model dict "
                        "loaded from disk")
        except FileNotFoundError:
            raise FileNotFoundError(
                f"\nError: Model file not found at path: {model_save_path}"
                "\nPlease run phase 1 training and then run phase 2.")

        except KeyError:
            raise KeyError(
                "\nError: Missing key in loaded state dictionary. "
                "\nThis might be due to a mismatch between the model "
                "architecture and the saved state.")

        except Exception as e:
            raise Exception(
                f"\nError: Model state cannot be loaded from dict. {e}")

    data = full_data
    if self.config.model.phase_number == 1:
        # Phase 1 trains on the undersampled data and must save its state
        data = data_undersampled
        if not t_config.auto_save_model:
            logger.info("For phase 1, model state has to be saved. "
                        "Saving model...")
            t_config.auto_save_model = True
        logger.info("Training model for Phase 1 now...")

    report = train_model(self.model, data=data, config=self.config,
                         save_dir_path=save_dir_path)

    # If autosave, then load the best model here
    if t_config.auto_save_model:
        if save_dir_path is None:
            raise Exception("The `save_dir_path` argument is required if "
                            "`auto_save_model` is `True` in the config")
        else:
            path = os.path.join(save_dir_path, 'model.dat')
            device = torch.device(g_config.device)
            self.model.load_state_dict(torch.load(
                path, map_location=device))

            # Save everything now
            serialise(self.config.general.serialiser, self, save_dir_path,
                      overwrite=overwrite)

    self.config.train.last_train_on = datetime.now().timestamp()
    return report

MetaCATAddon

MetaCATAddon(config: ConfigMetaCAT, base_tokenizer: BaseTokenizer, meta_cat: Optional[MetaCAT])

Bases: AddonComponent

Methods:

Attributes:

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
69
70
71
72
73
74
def __init__(self, config: ConfigMetaCAT, base_tokenizer: BaseTokenizer,
             meta_cat: Optional['MetaCAT']) -> None:
    """Wrap a (possibly absent) MetaCAT model as a pipeline addon."""
    self.config = config
    # The addon is named after the meta-annotation category it predicts
    self._name = config.general.category_name
    self._mc = meta_cat
    self._init_data_paths(base_tokenizer)

DEFAULT_TOKENIZER class-attribute instance-attribute

DEFAULT_TOKENIZER = 'spacy'

addon_type class-attribute instance-attribute

addon_type = 'meta_cat'

config instance-attribute

config: ConfigMetaCAT = config

include_in_output property

include_in_output: bool

mc property

mc: MetaCAT

name property

name: str

output_key class-attribute instance-attribute

output_key = 'meta_anns'

create_new classmethod

create_new(config: ConfigMetaCAT, base_tokenizer: BaseTokenizer, tknzer_preprocessor: TokenizerPreprocessor = None) -> MetaCATAddon

Factory method to create a new MetaCATAddon instance.

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
82
83
84
85
86
87
88
89
90
91
@classmethod
def create_new(cls, config: ConfigMetaCAT, base_tokenizer: BaseTokenizer,
               tknzer_preprocessor: Optional[TokenizerPreprocessor] = None
               ) -> 'MetaCATAddon':
    """Factory method to create a new MetaCATAddon instance.

    Args:
        config (ConfigMetaCAT): The meta-cat config.
        base_tokenizer (BaseTokenizer): The base (document) tokenizer.
        tknzer_preprocessor (Optional[TokenizerPreprocessor]):
            Optional callable applied to the newly initialised
            meta-cat tokenizer. Defaults to None.

    Returns:
        MetaCATAddon: The created addon.
    """
    tokenizer = init_tokenizer(config)
    if tknzer_preprocessor is not None:
        tknzer_preprocessor(tokenizer)
    meta_cat = MetaCAT(tokenizer, embeddings=None, config=config)
    return cls(config, base_tokenizer, meta_cat)

create_new_component classmethod

create_new_component(cnf: ComponentConfig, tokenizer: BaseTokenizer, cdb: CDB, vocab: Vocab, model_load_path: Optional[str]) -> MetaCATAddon
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
@classmethod
def create_new_component(
        cls, cnf: ComponentConfig, tokenizer: BaseTokenizer,
        cdb: CDB, vocab: Vocab, model_load_path: Optional[str]
        ) -> 'MetaCATAddon':
    """Create a new meta-cat addon, loading from disk when available."""
    if not isinstance(cnf, ConfigMetaCAT):
        raise ValueError(f"Incompatible config: {cnf}")
    if model_load_path is None:
        # No saved model - create a blank one
        # TODO: tokenizer preprocessing for (e.g) BPE tokenizer (see PR #67)
        return cls.create_new(cnf, tokenizer, None)
    # Load the previously saved addon from its component sub-folder
    sub_folder = cls.get_folder_name_for_addon_and_name(
        cls.addon_type, str(cnf.general.category_name))
    load_path = os.path.join(
        model_load_path, COMPONENTS_FOLDER, sub_folder)
    return cls.load_existing(cnf, tokenizer, load_path)

deserialise_from classmethod

deserialise_from(folder_path: str, **init_kwargs) -> MetaCATAddon
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
@classmethod
def deserialise_from(cls, folder_path: str, **init_kwargs
                     ) -> 'MetaCATAddon':
    """Deserialise a MetaCATAddon from the given folder.

    Handles both the current on-disk format and the legacy (v1) format,
    which is detected by the presence of a 'model.dat' file.
    """
    if "model.dat" in os.listdir(folder_path):
        # Legacy (v1) model - convert it unless conversion is disabled
        if not avoid_legacy_conversion():
            doing_legacy_conversion_message(
                logger, cls.__name__, folder_path)
            from medcat.utils.legacy.convert_meta_cat import (
                get_meta_cat_from_old)
            return get_meta_cat_from_old(
                folder_path, cls._create_throwaway_tokenizer())
        raise LegacyConversionDisabledError(cls.__name__,)
    if 'cnf' in init_kwargs:
        cnf = init_kwargs['cnf']
    else:
        config_path = os.path.join(folder_path, "meta_cat", "config")
        if not os.path.exists(config_path):
            # load legacy config (assuming it exists)
            config_path += ".dat"
        logger.info(
            "Was not provided a config when loading a meta cat from "
            "'%s'. Inferring config from file at '%s'", folder_path,
            config_path)
        cnf = ConfigMetaCAT.load(config_path)
    if 'model_config' in init_kwargs:
        cnf.merge_config(init_kwargs['model_config'])
    if 'tokenizer' in init_kwargs:
        tokenizer = init_kwargs['tokenizer']
    else:
        tokenizer = cls._create_throwaway_tokenizer()
    return cls.load_existing(
        load_path=folder_path,
        cnf=cnf,
        base_tokenizer=tokenizer)

get_hash

get_hash() -> str
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
256
257
258
259
260
def get_hash(self) -> str:
    """Return the underlying model's hash, or a placeholder if absent."""
    if not self._mc:
        # no model loaded yet - nothing meaningful to hash
        return 'No-model'
    return self._mc.get_hash()

get_init_attrs classmethod

get_init_attrs() -> list[str]
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
244
245
246
@classmethod
def get_init_attrs(cls) -> list[str]:
    """No constructor attributes need to be serialised for this addon."""
    return list()

get_output_key_val

get_output_key_val(ent: MutableEntity) -> tuple[str, dict[str, MetaAnnotationValue]]
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
172
173
174
175
176
177
def get_output_key_val(self, ent: MutableEntity
                       ) -> tuple[str, dict[str, MetaAnnotationValue]]:
    """Pair this addon's output key with the entity's meta annotations.

    NOTE: with multiple MetaCATs this is invoked once per MetaCAT and
    each call yields the same value - a harmless, minor redundancy.
    """
    meta_anns = ent.get_addon_data(_META_ANNS_PATH)
    return self.output_key, meta_anns

get_strategy

get_strategy() -> SerialisingStrategy
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
241
242
def get_strategy(self) -> SerialisingStrategy:
    """This addon performs its own (de)serialisation by hand."""
    return SerialisingStrategy.MANUAL

ignore_attrs classmethod

ignore_attrs() -> list[str]
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
248
249
250
@classmethod
def ignore_attrs(cls) -> list[str]:
    """No attributes are excluded from serialisation."""
    return list()

include_properties classmethod

include_properties() -> list[str]
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
252
253
254
@classmethod
def include_properties(cls) -> list[str]:
    """No extra properties need to be included in serialisation."""
    return list()

load

load(folder_path: str) -> MetaCAT
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
126
127
128
129
130
131
def load(self, folder_path: str) -> 'MetaCAT':
    """Deserialise a MetaCAT model (and its tokenizer) from disk.

    Args:
        folder_path (str): The folder the model was saved into.

    Returns:
        MetaCAT: The loaded model with its tokenizer attached.
    """
    model_path, tok_folder = self._get_meta_cat_and_tokenizer_paths(
        folder_path)
    loaded = cast(MetaCAT, deserialise(model_path,
                                       save_dir_path=folder_path))
    # the tokenizer is stored separately from the serialised model
    loaded.tokenizer = self._load_tokenizer(self.config, tok_folder)
    return loaded

load_existing classmethod

load_existing(cnf: ConfigMetaCAT, base_tokenizer: BaseTokenizer, load_path: str) -> MetaCATAddon

Factory method to load an existing MetaCATAddon from disk.

Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
110
111
112
113
114
115
116
117
@classmethod
def load_existing(cls, cnf: ConfigMetaCAT,
                  base_tokenizer: BaseTokenizer,
                  load_path: str) -> 'MetaCATAddon':
    """Factory method to load an existing MetaCATAddon from disk."""
    # build a placeholder instance first, then attach the loaded model
    addon = cls(cnf, base_tokenizer, None)
    addon._mc = addon.load(load_path)
    return addon

save

save(folder_path: str) -> None
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
def save(self, folder_path: str) -> None:
    """Save the MetaCAT model and its tokenizer under folder_path.

    Expects ``folder_path`` to already exist; creates the model and
    tokenizer sub-folders inside it.

    Args:
        folder_path (str): Existing folder to save into.

    Raises:
        MisconfiguredMetaCATException: If the model has no tokenizer.
    """
    # Fail fast before touching the filesystem: previously the model
    # was serialised first, so a missing tokenizer left a partially
    # written (model-only) save on disk before raising.
    if self.mc.tokenizer is None:
        raise MisconfiguredMetaCATException(
            "Unable to save MetaCAT without a tokenizer")
    mc_path, tokenizer_folder = self._get_meta_cat_and_tokenizer_paths(
        folder_path)
    os.mkdir(mc_path)
    os.mkdir(tokenizer_folder)
    serialise(self.config.general.serialiser, self.mc, mc_path)
    self.mc.tokenizer.save(tokenizer_folder)
    if self.config.model.model_name == 'bert':
        # BERT models keep an extra HF config file alongside the model
        model_config_save_path = os.path.join(
            folder_path, 'bert_config.json')
        self._mc.model.bert_config.to_json_file(  # type: ignore
            model_config_save_path)

serialise_to

serialise_to(folder_path: str) -> None
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
181
182
183
def serialise_to(self, folder_path: str) -> None:
    """Serialise this addon into a newly created folder.

    Raises FileExistsError if the target folder already exists.
    """
    os.mkdir(folder_path)
    self.save(folder_path)

get_meta_annotations

get_meta_annotations(entity: MutableEntity) -> dict[str, MetaAnnotationValue]
Source code in medcat-v2/medcat/components/addons/meta_cat/meta_cat.py
263
264
265
def get_meta_annotations(entity: MutableEntity
                         ) -> dict[str, MetaAnnotationValue]:
    """Look up the meta-annotation mapping attached to an entity."""
    anns: dict[str, MetaAnnotationValue] = entity.get_addon_data(
        _META_ANNS_PATH)
    return anns