Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add summarize CLI #343

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/pyobo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
get_object_properties,
get_object_properties_df,
get_obsolete,
get_prefixes,
get_primary_curie,
get_primary_identifier,
get_properties,
Expand Down Expand Up @@ -117,6 +118,7 @@
"get_object_properties_df",
"get_obsolete",
"get_ontology",
"get_prefixes",
"get_primary_curie",
"get_primary_identifier",
"get_properties",
Expand Down
3 changes: 2 additions & 1 deletion src/pyobo/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
has_ancestor,
is_descendent,
)
from .metadata import get_metadata
from .metadata import get_metadata, get_prefixes
from .names import (
get_definition,
get_id_definition_mapping,
Expand Down Expand Up @@ -97,6 +97,7 @@
"get_object_properties_df",
"get_obsolete",
"get_ontology",
"get_prefixes",
"get_primary_curie",
"get_primary_identifier",
"get_priority_curie",
Expand Down
18 changes: 18 additions & 0 deletions src/pyobo/api/metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""High-level API for metadata."""

import logging
from collections import Counter
from functools import lru_cache
from typing import Any, cast

Expand All @@ -15,6 +16,8 @@

__all__ = [
"get_metadata",
"get_prefixes",
"get_references_to",
]

logger = logging.getLogger(__name__)
Expand All @@ -33,3 +36,18 @@ def _get_json() -> dict[str, Any]:
return ontology.get_metadata()

return cast(dict[str, Any], _get_json())


def get_prefixes(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> Counter[str]:
    """Count the unique references to each vocabulary that appear in the ontology.

    :param prefix: The prefix of the ontology to load.
    :param kwargs: Keyword arguments passed through to :func:`get_ontology`.
    :returns: A counter keyed by the prefix of each referenced vocabulary, whose
        value is the number of distinct references with that prefix.
    """
    references_by_prefix = get_ontology(prefix, **kwargs)._get_references()
    unique_counts: Counter[str] = Counter()
    for external_prefix, references in references_by_prefix.items():
        # the length of the per-prefix collection is the number of *distinct* references
        unique_counts[external_prefix] = len(references)
    return unique_counts


def get_references_to(
    prefix: str, target_prefix: str, **kwargs: Unpack[GetOntologyKwargs]
) -> dict[str, int]:
    """Count how often each identifier from a target vocabulary is used in the ontology.

    :param prefix: The prefix of the ontology to load.
    :param target_prefix: The prefix of the vocabulary whose references should be counted.
    :param kwargs: Keyword arguments passed through to :func:`get_ontology`.
    :returns: A dictionary from local unique identifier (within ``target_prefix``)
        to the number of times it is referenced. Empty if the ontology
        makes no reference to ``target_prefix``.
    """
    ontology = get_ontology(prefix, **kwargs)
    # fall back to an empty counter when the target vocabulary is never referenced
    references = ontology._get_references().get(target_prefix, Counter())
    return {reference.identifier: count for reference, count in references.items()}
47 changes: 47 additions & 0 deletions src/pyobo/cli/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import warnings
from pathlib import Path

import bioregistry
import click
from more_click import verbose_option
from tqdm.contrib.logging import logging_redirect_tqdm
Expand All @@ -14,9 +15,11 @@
_iter_alts,
_iter_definitions,
_iter_edges,
_iter_external_counts,
_iter_mappings,
_iter_metadata,
_iter_names,
_iter_prefix_count,
_iter_properties,
_iter_relations,
_iter_species,
Expand Down Expand Up @@ -354,6 +357,50 @@ def mappings(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) ->
raise NotImplementedError("need to do initial manual upload of SSSOM build")


@database_annotate
def prefixes(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
    """Make the prefix count."""
    # Each output row is (ontology prefix, referenced external prefix, count),
    # as produced by _iter_prefix_count.
    with logging_redirect_tqdm():
        db_output_helper(
            _iter_prefix_count(**kwargs),
            "prefixes",
            ["prefix", "external", "count"],
            directory=directory,
        )
    if not zenodo:
        return
    # uploading is not wired up for this build yet
    raise NotImplementedError("need to do initial manual upload of prefixes build")


@database_annotate
@click.option("--ext", required=True, help="The external prefix whose usages are counted")
def usages(zenodo: bool, directory: Path, ext: str, **kwargs: Unpack[DatabaseKwargs]) -> None:
    """Make the usage count for identifiers from an external prefix.

    :param zenodo: If true, upload the result (not implemented for this build).
    :param directory: The directory handed to ``db_output_helper`` for the output.
    :param ext: The external prefix to count; normalized with the Bioregistry.
    :param kwargs: Keyword arguments passed through to ``_iter_external_counts``.
    :raises ValueError: If ``ext`` can't be normalized by the Bioregistry.
    :raises NotImplementedError: If ``zenodo`` is true.
    """
    norm_ext = bioregistry.normalize_prefix(ext)
    if norm_ext is None:
        raise ValueError(f"could not normalize external prefix with the Bioregistry: {ext!r}")
    # the middle column is named after the external prefix, since it holds its identifiers
    columns = [
        "prefix",
        norm_ext,
        "count",
    ]
    with logging_redirect_tqdm():
        it = _iter_external_counts(target_prefix=norm_ext, **kwargs)
        db_output_helper(
            it,
            f"{norm_ext}-usages",
            columns,
            directory=directory,
        )
    if zenodo:
        raise NotImplementedError(
            f"need to do initial manual upload of {norm_ext}-usages build"
        )


if __name__ == "__main__":
logging.captureWarnings(True)
with logging_redirect_tqdm():
Expand Down
26 changes: 26 additions & 0 deletions src/pyobo/cli/database_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import functools
import gzip
import logging
import warnings
Expand All @@ -22,11 +23,13 @@
get_id_to_alts,
get_mappings_df,
get_metadata,
get_prefixes,
get_properties_df,
get_relations_df,
get_typedef_df,
get_xrefs_df,
)
from ..api.metadata import get_references_to
from ..getters import IterHelperHelperDict, iter_helper, iter_helper_helper
from ..sources import pubchem
from ..sources.ncbi import ncbigene
Expand Down Expand Up @@ -163,3 +166,26 @@ def _iter_mappings(
it = iter_helper_helper(f, **kwargs)
for _prefix, df in it:
yield from df.values


def _iter_prefix_count(**kwargs: Unpack[IterHelperHelperDict]) -> Iterable[tuple[str, str, str]]:
    """Iterate over all prefix-external prefix-count triples.

    Rows where the ontology references its own prefix are left out.

    :param kwargs: Keyword arguments passed through to ``iter_helper``.
    :yields: Triples of ontology prefix, external prefix, and the count as a string.
    """
    for source_prefix, referenced_prefix, n_references in iter_helper(get_prefixes, **kwargs):
        if source_prefix == referenced_prefix:
            # self-references aren't "external", so skip them
            continue
        yield source_prefix, referenced_prefix, str(n_references)


def _iter_external_counts(
    target_prefix: str, **kwargs: Unpack[IterHelperHelperDict]
) -> Iterable[tuple[str, str, str]]:
    """Iterate over prefix, external identifier, and count triples for one target prefix.

    :param target_prefix: The external prefix whose identifiers are counted.
    :param kwargs: Keyword arguments passed through to ``iter_helper``.
    :yields: Triples of ontology prefix, external identifier, and the count as a string.
    """
    counter_func = functools.partial(get_references_to, target_prefix=target_prefix)
    # partial objects have no __name__ of their own, so one is assigned here
    # that also records the bound target prefix
    counter_func.__name__ = f'get_references_to(target_prefix="{target_prefix}")'  # type:ignore
    yield from (
        (source_prefix, identifier, str(n_usages))
        for source_prefix, identifier, n_usages in iter_helper(counter_func, **kwargs)
    )
17 changes: 17 additions & 0 deletions src/pyobo/cli/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import bioregistry
import click
from more_click import verbose_option
from tqdm.contrib.logging import logging_redirect_tqdm
from typing_extensions import Unpack

from .utils import (
Expand Down Expand Up @@ -42,6 +43,7 @@
from ..constants import LookupKwargs
from ..getters import get_ontology
from ..struct import Reference
from ..struct.reference import get_preferred_curie

__all__ = [
"lookup",
Expand Down Expand Up @@ -300,3 +302,18 @@ def prefixes(**kwargs: Unpack[LookupKwargs]) -> None:
ontology = get_ontology(**kwargs)
for prefix in sorted(ontology._get_prefixes(), key=str.casefold):
click.echo(prefix)


@lookup_annotate
def usage(**kwargs: Unpack[LookupKwargs]) -> None:
    """Summarize how often each prefix is referenced in an ontology.

    Prints a table with one row per prefix: the number of unique references,
    the total number of usages, and an example CURIE.

    :param kwargs: Keyword arguments passed through to ``get_ontology``.
    """
    from tabulate import tabulate

    with logging_redirect_tqdm():
        ontology = get_ontology(**kwargs)
        references = ontology._get_references()
    rows = [
        (
            prefix,
            len(counter),
            sum(counter.values()),
            # the smallest reference is used as a deterministic example;
            # guard against an empty counter, on which min() would raise
            get_preferred_curie(min(counter)) if counter else "",
        )
        for prefix, counter in sorted(references.items())
    ]
    click.echo(tabulate(rows, headers=["prefix", "unique", "total", "example"]))
57 changes: 34 additions & 23 deletions src/pyobo/struct/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import os
import sys
import warnings
from collections import ChainMap, defaultdict
from collections import ChainMap, Counter, defaultdict
from collections.abc import Callable, Collection, Iterable, Iterator, Mapping, Sequence
from dataclasses import dataclass, field
from pathlib import Path
Expand Down Expand Up @@ -52,9 +52,9 @@
StanzaType,
UnionOfHint,
_chain_tag,
_count_references_from_annotations,
_ensure_ref,
_get_prefixes_from_annotations,
_get_references_from_annotations,
_tag_property_targets,
)
from .utils import _boolean_tag, obo_escape_slim
Expand Down Expand Up @@ -131,19 +131,26 @@ def __lt__(self, other: Synonym) -> bool:
"""Sort lexically by name."""
return self._sort_key() < other._sort_key()

def _get_references(self) -> defaultdict[str, set[Reference]]:
def _get_references(self) -> defaultdict[str, Counter[Reference]]:
"""Get all prefixes used by the typedef."""
rv: defaultdict[str, set[Reference]] = defaultdict(set)
rv[v.has_dbxref.prefix].add(v.has_dbxref)
rv: defaultdict[str, Counter[Reference]] = defaultdict(Counter)

def _add(r: Reference) -> None:
rv[r.prefix][r] += 1

if self.type is not None:
rv[self.type.prefix].add(self.type)
_add(self.type)
rv[self.type.prefix][self.type] += 1
if self.provenance:
_add(v.has_dbxref)

for provenance in self.provenance:
match provenance:
case Reference():
rv[provenance.prefix].add(provenance)
_add(provenance)
case OBOLiteral(_, datatype, _language):
rv[datatype.prefix].add(v._c(datatype))
for prefix, references in _get_references_from_annotations(self.annotations).items():
_add(v._c(datatype))
for prefix, references in _count_references_from_annotations(self.annotations).items():
rv[prefix].update(references)
return rv

Expand Down Expand Up @@ -227,15 +234,15 @@ def to_obo(self, ontology_prefix: str) -> str:
rv = f"{rv} {self.specificity}"
return rv

def _get_references(self) -> dict[str, Counter[Reference]]:
    """Count all references used by this definition, grouped by prefix.

    :returns: A mapping from prefix to a counter of how many times each
        reference with that prefix is used.
    """
    rv: defaultdict[str, Counter[Reference]] = defaultdict(Counter)
    rv[self.reference.prefix][self.reference] += 1
    if self.specificity is not None:
        # weird syntax, but this just gets the synonym scope
        # predicate as a pyobo reference
        r = v._c(_cv.synonym_scopes[self.specificity])
        rv[r.prefix][r] += 1
    # hand back a plain dict so lookups of missing prefixes don't create entries
    return dict(rv)


Expand Down Expand Up @@ -705,23 +712,27 @@ def _get_prefixes(self) -> set[str]:
prefixes.add("oboInOwl")
return prefixes

def _get_references(self) -> dict[str, Counter[Reference]]:
    """Count all references used by the ontology, grouped by prefix.

    :returns: A mapping from prefix to a counter of how many times each
        reference with that prefix is used.
    """
    rv: defaultdict[str, Counter[Reference]] = defaultdict(Counter)

    def _add(r: Reference) -> None:
        rv[r.prefix][r] += 1

    # merge the counts from everything iterated from the ontology itself
    # plus its typedefs and synonym typedefs
    for rr in itt.chain(self, self.typedefs or [], self.synonym_typedefs or []):
        for prefix, counter in rr._get_references().items():
            rv[prefix].update(counter)

    for subset, _ in self.subsetdefs or []:
        _add(subset)
    # _iterate_property_pairs covers metadata, root terms,
    # and properties in self.property_values
    for prefix, counter in _count_references_from_annotations(
        self._iterate_property_pairs()
    ).items():
        rv[prefix].update(counter)
    if self.auto_generated_by:
        _add(v.obo_autogenerated_by)
    return dict(rv)

def _get_version(self) -> str | None:
Expand Down Expand Up @@ -2066,11 +2077,11 @@ def __hash__(self) -> int:
# have to re-define hash because of the @dataclass
return hash((self.__class__, self.prefix, self.identifier))

def _get_references(self) -> dict[str, set[Reference]]:
def _get_references(self) -> dict[str, Counter[Reference]]:
rv = super()._get_references()

def _add(r: Reference) -> None:
rv[r.prefix].add(r)
rv[r.prefix][r] += 1

if self.domain:
_add(self.domain)
Expand Down
Loading