# Source code for biomappings.testing

# -*- coding: utf-8 -*-

"""Validation tests for :mod:`biomappings`."""

import itertools as itt
import unittest
from collections import defaultdict
from pathlib import Path
from typing import ClassVar, Union

import bioregistry

from biomappings.resources import (
    CURATORS_PATH,
    Mappings,
    MappingTuple,
    PredictionTuple,
    load_curators,
    load_mappings,
    load_predictions,
    mapping_sort_key,
)
from biomappings.resources.semapv import get_semapv
from biomappings.utils import (
    InvalidIdentifierPattern,
    InvalidNormIdentifier,
    check_valid_prefix_id,
    get_canonical_tuple,
)

# Public API of this module: the two test case base classes defined below.
__all__ = [
    "IntegrityTestCase",
    "PathIntegrityTestCase",
]

# Loaded once at import time; the tests check that the part of each type
# annotation after the "semapv:" prefix is contained in this object.
semapv = get_semapv()


def _extract_redundant(counter):
    return [(key, values) for key, values in counter.items() if len(values) > 1]


def _locations_str(locations):
    return ", ".join(f"{label}:{line}" for label, line in locations)



class IntegrityTestCase(unittest.TestCase):
    """Data integrity tests.

    The four class attributes below must be populated (for example in
    ``setUpClass``) before any of the tests are run.
    """

    # Mappings reported under the "positive" label
    mappings: Mappings
    # Mappings reported under the "predictions" label
    predictions: Mappings
    # Mappings reported under the "negative" label
    incorrect: Mappings
    # Mappings reported under the "unsure" label
    unsure: Mappings

    def _iter_groups(self):
        """Iterate over every mapping in all four groups.

        :yields: Triples of the group label, the line number, and the mapping.
            Numbering starts at 2 - presumably because line 1 of each
            file is a header row (confirm against the loaders).
        """
        for group, label in [
            (self.mappings, "positive"),
            (self.incorrect, "negative"),
            (self.predictions, "predictions"),
            (self.unsure, "unsure"),
        ]:
            for i, mapping in enumerate(group, start=2):
                yield label, i, mapping

    def test_prediction_types(self):
        """Test that the prediction type is pulled in properly."""
        for line, mapping in enumerate(self.mappings, start=2):
            # Strip the looked-up value itself (not the default literal) so that
            # missing, empty, and whitespace-only cells are all skipped.
            pt = (mapping.get("prediction_type") or "").strip()
            if not pt:
                continue
            self.assertTrue(
                pt.startswith("semapv:"),
                msg=f"Prediction type should be annotated with semapv on line {line}",
            )
            self.assertIn(pt[len("semapv:") :], semapv)
            self.assertNotEqual(
                "semapv:ManualMappingCuration",
                pt,
                msg="Prediction can not be annotated with manual curation",
            )

        for label, line, mapping in self._iter_groups():
            tt = mapping["type"]
            self.assertTrue(
                tt.startswith("semapv:"),
                msg=f"[{label}] The 'type' column should be annotated with semapv on line {line}",
            )
            self.assertIn(tt[len("semapv:") :], semapv)

    def test_relations(self):
        """Test that the relation is a CURIE."""
        for label, line, mapping in self._iter_groups():
            parts = mapping["relation"].split(":")
            self.assertEqual(
                2,
                len(parts),
                msg=f"[{label}:{line}] relation should have exactly one colon",
            )
            prefix, identifier = parts
            self.assertNotEqual("ro", prefix, msg="RO should be capitalized")
            # Identifiers under the "RO" prefix are not checked for canonical form
            if prefix != "RO":
                self.assert_canonical_identifier(prefix, identifier, label, line)

    def test_canonical_prefixes(self):
        """Test that all mappings use canonical bioregistry prefixes."""
        valid_prefixes = set(bioregistry.read_registry())
        for label, line, mapping in self._iter_groups():
            source_prefix, target_prefix = mapping["source prefix"], mapping["target prefix"]
            self.assertIn(
                source_prefix,
                valid_prefixes,
                msg=f"Invalid prefix: {source_prefix} on {label}:{line}",
            )
            self.assertIn(
                target_prefix,
                valid_prefixes,
                msg=f"Invalid prefix: {target_prefix} on {label}:{line}",
            )

    def test_normalized_identifiers(self):
        """Test that all identifiers have been normalized (based on bioregistry definition)."""
        for label, line, mapping in self._iter_groups():
            self.assert_canonical_identifier(
                mapping["source prefix"], mapping["source identifier"], label, line
            )
            self.assert_canonical_identifier(
                mapping["target prefix"], mapping["target identifier"], label, line
            )

    def assert_canonical_identifier(
        self, prefix: str, identifier: str, label: str, line: int
    ) -> None:
        """Assert a given identifier is canonical.

        :param prefix: The prefix to check
        :param identifier: The identifier in the semantic space for the prefix
        :param label: The label of the mapping file
        :param line: The line number of the mapping
        """
        try:
            check_valid_prefix_id(prefix, identifier)
        except (InvalidNormIdentifier, InvalidIdentifierPattern) as e:
            # Report as a test failure that points at the offending row
            self.fail(f"[{label}:{line}] {e}")

    def test_contributors(self):
        """Test all contributors have an entry in the curators.tsv file."""
        contributor_orcids = {row["orcid"] for row in load_curators()}
        for mapping in itt.chain(self.mappings, self.incorrect, self.unsure):
            source = mapping["source"]
            if not source.startswith("orcid:"):
                # The only other accepted form starts with "web-"; it still fails,
                # with instructions on how to register the curator.
                self.assertTrue(source.startswith("web-"))
                ss = source[len("web-") :]
                self.fail(msg=f'Add an entry with "{ss}" and your ORCID to {CURATORS_PATH}')
            self.assertIn(source[len("orcid:") :], contributor_orcids)

    def test_cross_redundancy(self):
        """Test the redundancy of manually curated mappings and predicted mappings.

        :raises ValueError: if the same canonical mapping appears under more than one label
        """
        # canonical tuple -> label -> line numbers where it occurs
        counter = defaultdict(lambda: defaultdict(list))
        for label, line, mapping in self._iter_groups():
            counter[get_canonical_tuple(mapping)][label].append(line)

        redundant = []
        for mapping, label_to_lines in counter.items():
            # Repeats within a single label are not this test's concern
            if len(label_to_lines) <= 1:
                continue
            redundant.append((mapping, sorted(label_to_lines.items())))

        if redundant:
            msg = "".join(
                f"\n  {mapping}: {_locations_str(locations)}" for mapping, locations in redundant
            )
            raise ValueError(f"{len(redundant)} are redundant: {msg}")

    def assert_no_internal_redundancies(self, m: Mappings, tuple_cls):
        """Assert that the list of mappings doesn't have any redundancies.

        :param m: The mappings to check
        :param tuple_cls: A class whose ``from_dict`` turns a mapping into a hashable
            object exposing ``source_curie`` and ``target_curie``
        :raises ValueError: if any mapping occurs more than once in ``m``
        """
        counter = defaultdict(list)
        # Start at 2 so reported locations use the same numbering as the
        # other tests in this class (see ``_iter_groups``).
        for line, mapping in enumerate(m, start=2):
            counter[tuple_cls.from_dict(mapping)].append(line)
        redundant = _extract_redundant(counter)
        if redundant:
            msg = "".join(
                f"\n  {mapping.source_curie}/{mapping.target_curie}: {locations}"
                for mapping, locations in redundant
            )
            raise ValueError(f"{len(redundant)} are redundant: {msg}")

    def test_predictions_sorted(self):
        """Test the predictions are in a canonical order."""
        self.assertEqual(
            self.predictions,
            sorted(self.predictions, key=mapping_sort_key),
            msg="Predictions are not sorted",
        )
        self.assert_no_internal_redundancies(self.predictions, PredictionTuple)

    def test_curations_sorted(self):
        """Test the true curated mappings are in a canonical order."""
        self.assertEqual(
            self.mappings,
            sorted(self.mappings, key=mapping_sort_key),
            msg="True curations are not sorted",
        )
        self.assert_no_internal_redundancies(self.mappings, MappingTuple)

    def test_false_mappings_sorted(self):
        """Test the false curated mappings are in a canonical order."""
        self.assertEqual(
            self.incorrect,
            sorted(self.incorrect, key=mapping_sort_key),
            msg="False curations are not sorted",
        )
        self.assert_no_internal_redundancies(self.incorrect, MappingTuple)

    def test_unsure_sorted(self):
        """Test the unsure mappings are in a canonical order."""
        self.assertEqual(
            self.unsure,
            sorted(self.unsure, key=mapping_sort_key),
            msg="Unsure curations are not sorted",
        )
        self.assert_no_internal_redundancies(self.unsure, MappingTuple)





class PathIntegrityTestCase(IntegrityTestCase):
    """A test case that can be configured with paths.

    For example, this might be used in a custom instance of Biomappings
    like in the following:

    .. code-block:: python

        from pathlib import Path

        from biomappings.testing import PathIntegrityTestCase

        HERE = Path(__file__).parent.resolve()

        class TestCustom(PathIntegrityTestCase):
            predictions_path = HERE.joinpath("predictions.tsv")
            positives_path = HERE.joinpath("positive.tsv")
            negatives_path = HERE.joinpath("negative.tsv")
            unsure_path = HERE.joinpath("unsure.tsv")
    """

    # Path passed to ``load_predictions`` to populate ``predictions``
    predictions_path: ClassVar[Union[str, Path]]
    # Path passed to ``load_mappings`` to populate ``mappings``
    positives_path: ClassVar[Union[str, Path]]
    # Path passed to ``load_mappings`` to populate ``incorrect``
    negatives_path: ClassVar[Union[str, Path]]
    # Path passed to ``load_mappings`` to populate ``unsure``
    unsure_path: ClassVar[Union[str, Path]]

    @classmethod
    def setUpClass(cls) -> None:
        """Set up the test case by loading the four configured files."""
        # Chain to the parent so any class-level setup further up the MRO still runs
        super().setUpClass()
        cls.predictions = load_predictions(path=cls.predictions_path)
        cls.mappings = load_mappings(path=cls.positives_path)
        cls.incorrect = load_mappings(path=cls.negatives_path)
        cls.unsure = load_mappings(path=cls.unsure_path)