# Source code for biomappings.gilda_utils

# -*- coding: utf-8 -*-

"""Utilities for generating predictions with pyobo/gilda."""

import logging
from collections import defaultdict
from pathlib import Path
from typing import Iterable, Optional, Tuple, Union

import bioregistry
import pyobo
import pyobo.gilda_utils
from gilda.grounder import Grounder

from biomappings.resources import PredictionTuple, append_prediction_tuples
from biomappings.utils import CMapping

# Names exported as this module's public API (used by ``from ... import *``).
__all__ = [
    "append_gilda_predictions",
    "iter_prediction_tuples",
    "filter_custom",
    "filter_existing_xrefs",
    "has_mapping",
]

# Module-level logger named after this module; used to report filter counts.
logger = logging.getLogger(__name__)


def append_gilda_predictions(
    prefix: str,
    target_prefixes: Union[str, Iterable[str]],
    provenance: str,
    *,
    relation: str = "skos:exactMatch",
    custom_filter: Optional[CMapping] = None,
    unnamed: Optional[Iterable[str]] = None,
    identifiers_are_names: bool = False,
    path: Optional[Path] = None,
) -> None:
    """Add gilda predictions to the Biomappings predictions.tsv file.

    The predictions are generated for ``prefix`` against a grounder built
    from ``target_prefixes``, optionally filtered with ``custom_filter``,
    filtered against xrefs already available through PyOBO, sorted by
    source prefix and source name, and finally appended to the predictions
    file.

    :param prefix: The source prefix
    :param target_prefixes: The target prefix or prefixes. A single string is
        treated as one prefix; any other iterable (including a one-shot
        iterator) is materialized into a list exactly once.
    :param provenance: The provenance text. Typically generated with
        ``biomappings.utils.get_script_url(__file__)``.
    :param relation: The relationship. Defaults to ``skos:exactMatch``.
    :param custom_filter: A triple nested dictionary from source prefix to
        target prefix to source id to target id. Any source prefix, target
        prefix, source id combinations in this dictionary will be filtered.
    :param unnamed: An optional list of prefixes whose identifiers should be
        considered as names (e.g., CCLE, FPLX)
    :param identifiers_are_names: The source prefix's identifiers should be
        considered as names
    :param path: A custom path to predictions TSV file
    """
    if isinstance(target_prefixes, str):
        target_prefixes = [target_prefixes]
    else:
        # The prefixes are consumed twice below (building the grounder and
        # loading existing xrefs). Materialize them so that a generator or
        # other one-shot iterator is not silently empty on the second pass.
        target_prefixes = list(target_prefixes)

    grounder = pyobo.gilda_utils.get_grounder(target_prefixes, unnamed=unnamed)
    predictions = iter_prediction_tuples(
        prefix,
        relation=relation,
        grounder=grounder,
        provenance=provenance,
        identifiers_are_names=identifiers_are_names,
    )
    if custom_filter is not None:
        predictions = filter_custom(predictions, custom_filter)
    predictions = filter_existing_xrefs(predictions, [prefix, *target_prefixes])
    # Sorting consumes the lazy filter pipeline and gives a stable file order.
    predictions = sorted(predictions, key=_key)
    append_prediction_tuples(predictions, path=path)
def iter_prediction_tuples(
    prefix: str,
    provenance: str,
    *,
    relation: str = "skos:exactMatch",
    grounder: Optional[Grounder] = None,
    identifiers_are_names: bool = False,
) -> Iterable[PredictionTuple]:
    """Yield one prediction tuple per gilda match found for the given prefix.

    :param prefix: The source prefix, forwarded to
        ``pyobo.gilda_utils.iter_gilda_prediction_tuples``
    :param provenance: The provenance text, appended as the last field of
        every yielded tuple
    :param relation: The relationship, forwarded to PyOBO. Defaults to
        ``skos:exactMatch``.
    :param grounder: An optional grounder, forwarded to PyOBO
    :param identifiers_are_names: Forwarded to PyOBO unchanged
    :yields: A ``PredictionTuple`` built from each PyOBO tuple plus the
        provenance
    """
    raw_tuples = pyobo.gilda_utils.iter_gilda_prediction_tuples(
        prefix=prefix,
        relation=relation,
        grounder=grounder,
        identifiers_are_names=identifiers_are_names,
    )
    for fields in raw_tuples:
        # PyOBO supplies every field except the trailing provenance.
        yield PredictionTuple(*fields, provenance)  # type: ignore
def filter_custom(
    predictions: Iterable[PredictionTuple],
    custom_filter: CMapping,
) -> Iterable[PredictionTuple]:
    """Drop predictions that are covered by the custom mapping dictionary.

    :param predictions: The predictions to filter
    :param custom_filter: A nested dictionary looked up by source prefix,
        then target prefix, then source id. A prediction is dropped when that
        lookup yields a truthy value.
    :yields: The predictions that were not dropped, in their original order

    The number of dropped predictions is logged once the input is exhausted.
    """
    n_filtered = 0
    for prediction in predictions:
        by_target_prefix = custom_filter.get(prediction.source_prefix, {})
        by_source_id = by_target_prefix.get(prediction.target_prefix, {})
        if not by_source_id.get(prediction.source_id):
            yield prediction
        else:
            n_filtered += 1
    logger.info("filtered out %d custom mapped matches", n_filtered)
def filter_existing_xrefs(
    predictions: Iterable[PredictionTuple], prefixes: Iterable[str]
) -> Iterable[PredictionTuple]:
    """Filter predictions that match xrefs already loaded through PyOBO.

    :param predictions: The predictions to filter
    :param prefixes: The prefixes whose xrefs are loaded with
        ``pyobo.get_xrefs_df``; duplicates are ignored
    :yields: The predictions for which neither the source entity is already
        cross-referenced to the target prefix nor the target entity to the
        source prefix

    The number of dropped predictions is logged once the input is exhausted.
    """
    prefixes = set(prefixes)

    # Index from (prefix, identifier) to the set of prefixes that entity is
    # already cross-referenced to. Each xref row is recorded in both
    # directions so a prediction can be matched from either side.
    entity_to_mapped_prefixes = defaultdict(set)
    for prefix in prefixes:
        for source_id, target_prefix, target_id in pyobo.get_xrefs_df(prefix).values:
            entity_to_mapped_prefixes[prefix, source_id].add(target_prefix)
            entity_to_mapped_prefixes[target_prefix, target_id].add(prefix)

    counter = 0
    for prediction in predictions:
        source_id = bioregistry.standardize_identifier(
            prediction.source_prefix, prediction.source_id
        )
        target_id = bioregistry.standardize_identifier(
            prediction.target_prefix, prediction.target_identifier
        )
        # Look up with .get() instead of subscripting: subscripting a
        # defaultdict inserts an empty set for every miss, which would grow
        # the index by up to two entries per unmatched prediction.
        source_mapped = entity_to_mapped_prefixes.get(
            (prediction.source_prefix, source_id), ()
        )
        target_mapped = entity_to_mapped_prefixes.get(
            (prediction.target_prefix, target_id), ()
        )
        if (
            prediction.target_prefix in source_mapped
            or prediction.source_prefix in target_mapped
        ):
            counter += 1
            continue
        yield prediction
    logger.info("filtered out %d pre-mapped matches", counter)
def has_mapping(prefix: str, identifier: str, target_prefix: str) -> bool:
    """Report whether PyOBO already has an xref for an entity in a target namespace.

    :param prefix: The prefix of the entity
    :param identifier: The identifier of the entity
    :param target_prefix: The prefix of the namespace to look for an xref in
    :returns: ``True`` if ``pyobo.get_xref`` returns anything other than
        ``None``, otherwise ``False``
    """
    xref = pyobo.get_xref(prefix, identifier, target_prefix)
    return xref is not None
def _key(t: PredictionTuple) -> Tuple[str, str]:
    """Build the sort key for a prediction: source prefix, then source name."""
    source_prefix = t.source_prefix
    source_name = t.source_name
    return source_prefix, source_name