Skip to content

Commit

Permalink
Merge pull request #71 from christian-monch/crossport-metadata-changes
Browse files Browse the repository at this point in the history
Crossport core-located metadata bug-fixes
  • Loading branch information
yarikoptic authored Oct 21, 2022
2 parents 2d98816 + a9e5f64 commit 6f5a029
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 59 deletions.
12 changes: 6 additions & 6 deletions datalad_deprecated/metadata/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

import datalad
from datalad.consts import DATASET_CONFIG_FILE

from datalad.interface.annotate_paths import _minimal_annotate_paths
from datalad.interface.base import Interface
from datalad.interface.utils import (
eval_results,
Expand Down Expand Up @@ -61,18 +61,18 @@
)
from datalad.core.local.status import get_paths_by_ds

from .annotate_paths import _minimal_annotate_paths
from .consts import DATASET_METADATA_FILE
from .metadata import (
exclude_from_metadata,
get_ds_aggregate_db_locations,
get_metadata_type,
load_ds_aggregate_db,
location_keys,
exclude_from_metadata,
get_metadata_type,
_get_metadata,
_get_metadatarelevant_paths,
_get_containingds_from_agginfo,
location_keys,
)

from .consts import DATASET_METADATA_FILE
from .utils import all_same


Expand Down
65 changes: 34 additions & 31 deletions datalad_deprecated/metadata/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,18 @@
from typing import (
Dict,
List,
Optional,
)

from datalad import cfg
from datalad.interface.annotate_paths import _minimal_annotate_paths
from datalad.interface.base import Interface
from datalad.interface.results import get_status_dict
from datalad.interface.utils import (
eval_results,
generic_result_renderer,
)
from datalad.interface.base import build_doc
from datalad.metadata.definitions import version as vocabulary_version
from datalad.support.collections import ReadOnlyDict, _val2hashable
from datalad.support.constraints import (
EnsureNone,
Expand All @@ -45,10 +46,7 @@
from datalad.support.exceptions import CapturedException
from datalad.support.param import Parameter
import datalad.support.ansi_colors as ac
from datalad.support.json_py import (
load as jsonload,
load_stream,
)
from datalad.support.json_py import load as jsonload
from datalad.interface.common_opts import recursion_flag
from datalad.distribution.dataset import (
Dataset,
Expand All @@ -60,13 +58,23 @@
ensure_list,
path_is_subpath,
path_startswith,

)
from datalad.ui import ui
from datalad.dochelpers import single_or_plural
from datalad.log import log_progress
from datalad.core.local.status import get_paths_by_ds

from .common_opts import reporton_opt
from .consts import (
OLDMETADATA_DIR,
OLDMETADATA_FILENAME,
)
from .utils import (
load_xzstream,
as_unicode,
)


# Check availability of new-generation metadata
try:
from datalad_metalad.dump import Dump
Expand All @@ -81,20 +89,6 @@ def __call__(self, *args, **kwargs):
return []
next_generation_metadata_available = False

from .annotate_paths import _minimal_annotate_paths
from .common_opts import reporton_opt
from .consts import (
OLDMETADATA_DIR,
OLDMETADATA_FILENAME,
)
from .definitions import version as vocabulary_version
from .utils import as_unicode


def load_xzstream(fname):
    """Yield each object decoded from the xz-compressed JSON stream at `fname`.

    Thin wrapper around ``load_stream`` that fixes ``compressed=True``.
    """
    yield from load_stream(fname, compressed=True)


lgr = logging.getLogger('datalad.metadata.metadata')

Expand Down Expand Up @@ -352,14 +346,14 @@ def legacy_query_aggregated_metadata(reporton, ds, aps, recursive=False,
def _query_aggregated_metadata_singlepath(
ds, agginfos, agg_base_path, qap, reporton, cache, dsmeta,
contentinfo_objloc):
"""This is the workhorse of query_aggregated_metadata() for querying for a
"""This is the workhorse of legacy_query_aggregated_metadata() for querying for a
single path"""
rpath = qap['rpath']
containing_ds = qap['metaprovider']
qtype = qap.get('type', None)
if (rpath == op.curdir or rpath == containing_ds) and \
((reporton is None and qtype == 'dataset') or \
reporton in ('datasets', 'all')):
if (rpath == op.curdir or rpath == containing_ds) \
and ((reporton is None and qtype == 'dataset')
or reporton in ('datasets', 'all')):
# this is a direct match for a dataset (we only have agginfos for
# datasets) -> prep result
res = get_status_dict(
Expand Down Expand Up @@ -1129,13 +1123,22 @@ def gen4_query_aggregated_metadata(reporton: str,
}
}
except NoMetadataStoreFound:
lgr.warning("Found no gen4-metadata in dataset %s.", dataset.pathobj)
if len(matching_types) == 2:
matching_type = "all"
elif len(matching_types) == 0:
matching_type = "none"
elif len(matching_types) == 1:
matching_type = matching_types[0]
else:
raise RuntimeError(f"Was expecting matching_types with 1 element, got {matching_types}")
yield {
**kwargs,
'path': str(ds.pathobj / relative_path),
'status': 'impossible',
'message': f'Dataset at {ds.pathobj} does not contain gen4 '
f'metadata',
'type': matching_types
'type': matching_type
}

return None
Expand All @@ -1145,9 +1148,9 @@ def query_aggregated_metadata(reporton: str,
ds: Dataset,
aps: List[Dict],
recursive: bool = False,
metadata_source: Optional[str] = None,
metadata_source: str = "legacy",
**kwargs):
"""Query legacy and NG-metadata stored in a dataset or its metadata store
"""Query legacy and gen4-metadata stored in a dataset or its metadata store
Parameters
----------
Expand All @@ -1161,11 +1164,11 @@ def query_aggregated_metadata(reporton: str,
recursive : bool
Whether or not to report metadata underneath all query paths
recursively.
metadata_source : Optional[str]
metadata_source : [str] {'all', 'legacy', 'gen4'}
Metadata source that should be used. If set to "legacy", only metadata
prior metalad version 0.3.0 will be queried, if set to "gen4", only
metadata of metalad version 0.3.0 and beyond will be queried, if set
to 'None', all known metadata will be queried.
to 'all', all known metadata will be queried.
**kwargs
Any other argument will be passed on to the query result dictionary.
Expand All @@ -1175,7 +1178,7 @@ def query_aggregated_metadata(reporton: str,
Of result dictionaries.
"""

if metadata_source in (None, "legacy"):
if metadata_source in ("all", "legacy"):
yield from legacy_query_aggregated_metadata(
reporton=reporton,
ds=ds,
Expand All @@ -1184,7 +1187,7 @@ def query_aggregated_metadata(reporton: str,
**kwargs
)

if metadata_source in (None, "gen4") and next_generation_metadata_available:
if metadata_source in ("all", "gen4") and next_generation_metadata_available:
yield from gen4_query_aggregated_metadata(
reporton=reporton,
ds=ds,
Expand Down
23 changes: 13 additions & 10 deletions datalad_deprecated/metadata/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@
get_suggestions_msg,
shortened_repr,
)

from datalad_deprecated.metadata.metadata import query_aggregated_metadata
from datalad_deprecated.metadata.utils import (

from .utils import (
as_unicode,
unicode_srctypes,
)
Expand Down Expand Up @@ -777,7 +777,7 @@ class _EGrepCSSearch(_Search):
_mode_label = 'egrepcs'
_default_documenttype = 'datasets'

def __init__(self, ds, metadata_source=None, **kwargs):
def __init__(self, ds, metadata_source='all', **kwargs):
super(_EGrepCSSearch, self).__init__(ds, metadata_source, **kwargs)
self._queried_keys = None # to be memoized by get_query

Expand Down Expand Up @@ -1320,14 +1320,14 @@ class Search(Interface):
for debugging purposes."""),
metadata_source=Parameter(
args=('--metadata-source',),
choices=('legacy', 'gen4'),
default=None,
choices=('legacy', 'gen4', 'all'),
doc="""if given, defines which metadata source will be used to
search. 'legacy' will limit search to metadata in the old format,
i.e. stored in '$DATASET/.datalad/metadata'. 'gen4' will limit
search to metadata stored by the git-backend of
'datalad-metadata-model'. If not given, metadata from all supported
sources will be included in search.""")
'datalad-metadata-model'. If 'all' is given, metadata from all
supported sources will be included in the search. The default is
'legacy'.""")
)

@staticmethod
Expand All @@ -1342,7 +1342,7 @@ def __call__(query=None,
full_record=False,
show_keys=None,
show_query=False,
metadata_source=None):
metadata_source='legacy'):
try:
ds = require_dataset(dataset, check_installed=True, purpose='dataset search')
if ds.id is None:
Expand All @@ -1357,7 +1357,7 @@ def __call__(query=None,

if mode is None:
# let's get inspired by what the dataset/user think is
# default.
# default
mode = ds.config.obtain('datalad.search.default-mode')

if mode == 'egrep':
Expand All @@ -1373,7 +1373,10 @@ def __call__(query=None,
'unknown search mode "{}"'.format(mode))

searcher = searcher(
ds, metadata_source=metadata_source, force_reindex=force_reindex)
ds,
metadata_source=metadata_source,
force_reindex=force_reindex
)

if show_keys:
searcher.show_keys(show_keys, regexes=query)
Expand Down
21 changes: 9 additions & 12 deletions datalad_deprecated/metadata/tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from uuid import UUID
from pkg_resources import EntryPoint

from unittest import SkipTest
from unittest.mock import (
MagicMock,
patch,
Expand All @@ -31,7 +30,7 @@
)
from datalad.support.exceptions import NoDatasetFound
from datalad.tests.utils_pytest import (
#SkipTest,
SkipTest,
assert_equal,
assert_in,
assert_is_generator,
Expand All @@ -40,7 +39,7 @@
assert_repo_status,
assert_result_count,
eq_,
skip_if_adjusted_branch,
known_failure_githubci_win,
ok_file_under_git,
patch_config,
with_tempfile,
Expand All @@ -64,7 +63,6 @@
)



@with_testsui(interactive=False)
@with_tempfile(mkdir=True)
def test_search_outside1_noninteractive_ui(tdir=None):
Expand Down Expand Up @@ -175,7 +173,7 @@ def test_search_non_dataset(tdir=None):
assert_in("datalad create --force", str(cme.value))


@skip_if_adjusted_branch
@known_failure_githubci_win
@with_tempfile(mkdir=True)
def test_within_ds_file_search(path=None):
try:
Expand Down Expand Up @@ -249,7 +247,7 @@ def test_within_ds_file_search(path=None):
# test default behavior while limiting set of keys reported
with swallow_outputs() as cmo:
ds.search([r'\.id', 'artist$'], show_keys='short')
out_lines = [l for l in cmo.out.split(os.linesep) if l]
out_lines = [l for l in cmo.out.splitlines() if l]
# test that only the ones matching were returned
assert_equal(
[l for l in out_lines if not l.startswith(' ')],
Expand Down Expand Up @@ -581,14 +579,12 @@ def mocked_no_metadata(**kwargs):
reporton='all',
ds=DatasetMock('ds'),
aps=mocked_annotated_paths,
metadata_source='gen4'
)
if metadata_result['status'] == 'ok'
]
assert_equal(len(result), 1)
assert_equal(
result[0]['metadata'],
{extractor_name: extracted_metadata}
)
assert len(result) == 1
assert result[0]['metadata'] == {extractor_name: extracted_metadata}

# check no metadata found-handling
dump_mock.reset_mock()
Expand All @@ -600,6 +596,7 @@ def mocked_no_metadata(**kwargs):
reporton='all',
ds=DatasetMock('ds'),
aps=mocked_annotated_paths,
metadata_source='all',
)
if metadata_result['status'] == 'impossible'
]
Expand Down Expand Up @@ -636,7 +633,7 @@ def reset_mock(mock, result):
reset_mock(legacy_mock, legacy_result)
r = tuple(
result["metadata_source"]
for result in search(dataset=temp_ds, query="v1")
for result in search(dataset=temp_ds, query="v1", metadata_source='all')
)
assert_equal(r, ("legacy", "gen4"))

Expand Down
5 changes: 5 additions & 0 deletions datalad_deprecated/metadata/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from datalad.support.json_py import load_stream
from datalad.utils import ensure_unicode


Expand Down Expand Up @@ -51,3 +52,7 @@ def as_unicode(val, cast_types=object):
raise TypeError(
"Value %r is not of any of known or provided %s types"
% (val, cast_types))


def load_xzstream(fname):
    """Generator over the objects in the xz-compressed JSON stream `fname`.

    Delegates to ``load_stream`` with ``compressed=True``.
    """
    for obj in load_stream(fname, compressed=True):
        yield obj

0 comments on commit 6f5a029

Please sign in to comment.