Skip to content

Commit

Permalink
Merge branch 'main' into dependabot/pip/invenio-accounts-5.1.7
Browse files Browse the repository at this point in the history
  • Loading branch information
GraemeWatt authored Dec 4, 2024
2 parents 122954e + ca52b85 commit cc0dce6
Showing 15 changed files with 523 additions and 37 deletions.
4 changes: 2 additions & 2 deletions hepdata/cli.py
Original file line number Diff line number Diff line change
@@ -223,9 +223,9 @@ def do_unload(records_to_unload):

@utils.command()
@with_appcontext
@click.option('--endpoint', '-e', type=str, help='Specific endpoint to update (e.g. "rivet" or "MadAnalysis" or "SModelS" or "Combine"). Omit for all.')
def find_and_add_record_analyses(endpoint):
    """Finds analyses such as Rivet, MadAnalysis 5, SModelS and Combine and adds them to records."""
    update_analyses(endpoint)


12 changes: 11 additions & 1 deletion hepdata/config.py
Original file line number Diff line number Diff line change
@@ -193,6 +193,7 @@ def _(x):
CFG_DATA_TYPE = 'datatable'
CFG_SUBMISSIONS_TYPE = 'submission'
CFG_DATA_KEYWORDS = ['observables', 'reactions', 'cmenergies', 'phrases']
CFG_SEARCH_RANGE_TERMS = ["recid", "publication_recid", "inspire_id"] # Possible terms used to OpenSearch API range searches

CFG_CONVERTER_URL = 'https://converter.hepdata.net'
CFG_SUPPORTED_FORMATS = ['yaml', 'root', 'csv', 'yoda', 'yoda1', 'original']
@@ -331,7 +332,16 @@ def _(x):
'endpoint_url': 'https://zenodo.org/records/13952092/files/smodels-analyses.hepdata.json?download=1',
'url_template': '{0}',
'subscribe_user_id': 7766
}
},
'Combine': {
'endpoint_url': 'https://cms-public-likelihoods-list.web.cern.ch/artifacts/output.json',
'url_template': 'https://doi.org/{0}',
'description': 'Statistical models',
'license': {
'name': 'cc-by-4.0',
'url': 'https://creativecommons.org/licenses/by/4.0'
},
},
#'ufo': {},
#'xfitter': {},
#'applgrid': {},
59 changes: 40 additions & 19 deletions hepdata/ext/opensearch/api.py
Original file line number Diff line number Diff line change
@@ -96,8 +96,9 @@ def search(query,
('collaboration', collaboration_name), ('date', date)
:param size: [int] max number of hits that should be returned
:param offset: [int] offset for the results (used for pagination)
:param sort_by: [string] sorting field. Currently supported fields:
"title", "collaboration", "date", "relevance"
:param sort_field: [string] sorting field. Currently supported fields:
"title", "collaboration", "date", "relevance",
"recid", "inspire_id"
:param sort_order: [string] order of the sorting either original
(for a particular field) or reversed. Supported:
'' or 'rev'
@@ -108,23 +109,41 @@ def search(query,
if query == '' and not sort_field:
sort_field = 'date'

query = HEPDataQueryParser.parse_query(query)
# Create search with preference param to ensure consistency of results across shards
search = RecordsSearch(using=os, index=index).with_preference_param()

# Determine if the query is range-based, and get it, or the default search order
range_terms, exclude_tables, parsed_query = HEPDataQueryParser.parse_range_query(query)

# We passed the newly range-parsed query to be parsed
query = HEPDataQueryParser.parse_query(parsed_query)
fuzzy_query = QueryString(query=query, fuzziness='AUTO')

if query:
fuzzy_query = QueryString(query=query, fuzziness='AUTO')
if exclude_tables:
search.query = fuzzy_query

if query and not exclude_tables:
search.query = fuzzy_query | \
Q('has_child', type="child_datatable", query=fuzzy_query)

# Add filter to search for only "publication" objects
search = search.filter("term", doc_type=CFG_PUB_TYPE)
search = QueryBuilder.add_filters(search, filters)


if range_terms and not sort_field and not sort_order:
# Set default search keyword, and set default sort to desc
sort_field = 'recid'
sort_order = 'desc'

try:
mapped_sort_field = sort_fields_mapping(sort_field)
except ValueError as ve:
return {'error': str(ve)}

search = search.sort({mapped_sort_field : {"order" : calculate_sort_order(sort_order, sort_field)}})

search = add_default_aggregations(search, filters)

if post_filter:
@@ -135,23 +154,25 @@ def search(query,

try:
pub_result = search.execute().to_dict()

parent_filter = {
"terms": {
"_id": [hit["_id"] for hit in pub_result['hits']['hits']]
data_result = {}
if not exclude_tables:
parent_filter = {
"terms": {
"_id": [hit["_id"] for hit in pub_result['hits']['hits']]
}
}
}

data_search = RecordsSearch(using=os, index=index)
data_search = data_search.query('has_parent',
parent_type="parent_publication",
query=parent_filter)
if query:
data_search = data_search.query(QueryString(query=query))
data_search = RecordsSearch(using=os, index=index)
data_search = data_search.query('has_parent',
parent_type="parent_publication",
query=parent_filter)

if query:
data_search = data_search.query(QueryString(query=query))

data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
data_search = data_search[0:data_search_size]
data_result = data_search.execute().to_dict()
data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
data_search = data_search[0:data_search_size]
data_result = data_search.execute().to_dict()

merged_results = merge_results(pub_result, data_result)
return map_result(merged_results, filters)
@@ -165,7 +186,7 @@ def search(query,
else:
log.error(f'An unexpected error occurred when searching: {e}')
reason = f'An unexpected error occurred: {e.error}'
return { 'error': reason }
return {'error': reason}


@author_index
6 changes: 6 additions & 0 deletions hepdata/ext/opensearch/config/os_config.py
Original file line number Diff line number Diff line change
@@ -113,6 +113,12 @@ def sort_fields_mapping(sort_by):
return 'creation_date'
elif sort_by == 'latest':
return 'last_updated'
elif sort_by == 'recid':
return 'recid' # No change required
elif sort_by == 'publication_recid':
return 'publication_recid' # No change required
elif sort_by == 'inspire_id':
return 'inspire_id' # No change required
elif not sort_by or sort_by == 'relevance':
return '_score'
else:
2 changes: 1 addition & 1 deletion hepdata/ext/opensearch/config/record_mapping.py
Original file line number Diff line number Diff line change
@@ -171,7 +171,7 @@
}
},
"inspire_id": {
"type": "text"
"type": "integer"
},
"keywords": {
"properties": {
2 changes: 1 addition & 1 deletion hepdata/ext/opensearch/document_enhancers.py
Original file line number Diff line number Diff line change
@@ -94,7 +94,7 @@ def add_shortened_authors(doc):

def add_analyses(doc):
"""
Add analyses links such as Rivet, MadAnalysis 5, SModelS, HistFactory and NUISANCE to the index.
Add analyses links such as Rivet, MadAnalysis 5, SModelS, Combine, HistFactory and NUISANCE to the index.
:param doc:
:return:
22 changes: 19 additions & 3 deletions hepdata/ext/opensearch/process_results.py
Original file line number Diff line number Diff line change
@@ -27,10 +27,26 @@
from hepdata.utils.miscellaneous import splitter


def merge_results(pub_result, data_result=None):
    """
    Merge results dictionaries of publication and data table
    search result data.

    The data result does not exist in publication-only searches, so it
    defaults to None, in which case only the publication hits are kept.

    :param pub_result: Publication search data.
    :param data_result: Data table search data, or None/empty for
        publication-only searches.
    :return: Merged search results dictionary with 'hits', 'total' and
        'aggregations' keys.
    """
    pub_hits = pub_result['hits']['hits']
    # Only append table hits when a data-table search was actually run.
    data_hits = data_result['hits']['hits'] if data_result else []
    return {
        'hits': pub_hits + data_hits,
        # Total and aggregations always come from the publication search.
        'total': pub_result['hits']['total']['value'],
        'aggregations': pub_result.get('aggregations', {}),
    }
40 changes: 39 additions & 1 deletion hepdata/ext/opensearch/query_builder.py
Original file line number Diff line number Diff line change
@@ -23,6 +23,8 @@
import re
from opensearch_dsl import Q

from hepdata.config import CFG_SEARCH_RANGE_TERMS


class QueryBuilder:

@@ -52,7 +54,8 @@ def parse_query(query_string):
"phrases": "data_keywords.phrases",
"reactions": "data_keywords.reactions",
"analysis": "analyses.type",
"resources": "resources.description" # Add shorthand for resource description
"resources": "resources.description", # Add shorthand for resource description
"publication_recid": "recid" # Shorthand for HEPData record ID
}
}

@@ -81,3 +84,38 @@ def _quote_phrase(phrase):
if '"' not in phrase and pattern.fullmatch(phrase):
return f'"{phrase}"'
return phrase

@staticmethod
def parse_range_query(query):
    """
    Parse and verify whether a query string contains a range-based query,
    e.g. ``publication_recid:[321 TO 321]`` or ``inspire_id:[123 TO 123]``.

    Collects which of the configured range-searchable terms appear as
    range clauses, decides whether the search is publication-only (tables
    excluded), and rewrites ``publication_recid`` to the ``recid`` field
    name used by the OpenSearch mapping.

    :param query: The full query string.
    :return: Tuple of (list of parsed range terms, boolean determining
        whether table exclusion should occur — True when the range term is
        ``publication_recid`` or ``inspire_id`` without a table-level
        ``recid`` range — and the query with the term replaced).
    """
    # Raw (non-f) string: '%s' is a %-format placeholder filled in per
    # term below.  Matches e.g. 'recid:[1 TO 10]' at the start of the
    # string or after whitespace.
    pattern = r"(?:^|\s)%s:\s*\[\d+\s+TO\s+\d+]"
    range_terms = []
    exclude_tables = False

    # Collect every configured term that appears as a range clause.
    for term in CFG_SEARCH_RANGE_TERMS:
        if re.search(pattern % re.escape(term), query):
            range_terms.append(term)

    # Range searches on publication-level identifiers only make sense
    # against publication objects, so exclude tables unless a table-level
    # 'recid' range was also requested.
    if ("publication_recid" in range_terms or "inspire_id" in range_terms) \
            and "recid" not in range_terms:
        exclude_tables = True

    # Finally, replace publication_recid with the field name used by the
    # OpenSearch mapping.
    query = query.replace("publication_recid", "recid")

    return range_terms, exclude_tables, query
1 change: 1 addition & 0 deletions hepdata/modules/records/assets/js/hepdata_common.js
Original file line number Diff line number Diff line change
@@ -46,6 +46,7 @@ HEPDATA.file_type_to_details = {
"rivet": {"icon": "area-chart", "description": "Rivet Analysis"},
"madanalysis": {"icon": "area-chart", "description": "MadAnalysis 5 Analysis"},
"smodels": {"icon": "area-chart", "description": "SModelS Analysis"},
"combine": {"icon": "area-chart", "description": "Combine Analysis"},
"xfitter": {"icon": "area-chart", "description": "xFitter Analysis"},
"applgrid": {"icon": "area-chart", "description": "APPLgrid Analysis"},
"ufo": {"icon": "rocket", "description": "Universal Feynrules Output (UFO)"},
Original file line number Diff line number Diff line change
@@ -41,6 +41,7 @@ <h4>Add Resource for <span id="selected_resource_item">Submission</span></h4>
<option value="applgrid">APPLgrid</option>
<option value="MadAnalysis">MadAnalysis 5</option>
<option value="SModelS">SModelS</option>
<option value="Combine">Combine</option>
<option value="rivet">Rivet</option>
<option value="fastnlo">fastNLO</option>
<option value="ufo">Universal Feynrules Output (UFO)</option>
13 changes: 11 additions & 2 deletions hepdata/modules/records/utils/analyses.py
Original file line number Diff line number Diff line change
@@ -35,6 +35,7 @@
from hepdata.utils.users import get_user_from_id
from hepdata.modules.records.subscribers.rest import subscribe
from hepdata.modules.records.subscribers.api import is_current_user_subscribed_to_record
from hepdata.modules.records.utils.common import get_license

logging.basicConfig()
log = logging.getLogger(__name__)
@@ -43,10 +44,11 @@
@shared_task
def update_analyses(endpoint=None):
"""
Update (Rivet, MadAnalysis 5 and SModelS) analyses and remove outdated resources.
Update (Rivet, MadAnalysis 5, SModelS and Combine) analyses and remove outdated resources.
Allow bulk subscription to record update notifications if "subscribe_user_id" in endpoint.
Add optional "description" and "license" fields if present in endpoint.
:param endpoint: either "rivet" or "MadAnalysis" or "SModelS" or None (default) for both
:param endpoint: either "rivet" or "MadAnalysis" or "SModelS" or "Combine" or None (default) for all
"""
endpoints = current_app.config["ANALYSES_ENDPOINTS"]
for analysis_endpoint in endpoints:
@@ -86,6 +88,13 @@ def update_analyses(endpoint=None):
file_location=_resource_url,
file_type=analysis_endpoint)

if "description" in endpoints[analysis_endpoint]:
new_resource.file_description = str(endpoints[analysis_endpoint]["description"])

if "license" in endpoints[analysis_endpoint]:
resource_license = get_license(endpoints[analysis_endpoint]["license"])
new_resource.file_license = resource_license.id

submission.resources.append(new_resource)
num_new_resources += 1

Original file line number Diff line number Diff line change
@@ -243,6 +243,13 @@ <h4>Other useful searches</h4>
(SModelS analysis)
</span>
</li>
<li>
<a href='/search?q=analysis:Combine&sort_by=latest'
target="_new">analysis:Combine</a>
<span class="text-muted">
(CMS statistical models in Combine format)
</span>
</li>
<li>
<a href='/search?q=analysis:HistFactory&sort_by=latest'
target="_new">analysis:HistFactory</a>
@@ -279,6 +286,32 @@ <h4>Searching via Inspire</h4>
</ul>
</div>

<div class="well well-small">
<h4>Range-based Searching</h4>
<p>
We support searching for a range of records using their HEPData record ID or Inspire ID.
</p>
<ul>
<li>Range searching by HEPData record ID:
<ul>
<li>
<a href='/search?q=publication_recid:[1 TO 10]'
target="_new">publication_recid:[1 TO 10]</a>
</li>
</ul>
</li>
<br/>
<li>Range searching by Inspire ID:
<ul>
<li>
<a href='/search?q=inspire_id:[1 TO 10000]'
target="_new">inspire_id:[1 TO 10000]</a>
</li>
</ul>
</li>
</ul>
</div>

</div>
</div>
</div>
2 changes: 1 addition & 1 deletion hepdata/version.py
Original file line number Diff line number Diff line change
@@ -28,4 +28,4 @@
and parsed by ``setup.py``.
"""

__version__ = "0.9.4dev20241112"
__version__ = "0.9.4dev20241204"
19 changes: 18 additions & 1 deletion tests/records_test.py
Original file line number Diff line number Diff line change
@@ -1031,7 +1031,7 @@ def test_create_breadcrumb_text():


def test_update_analyses(app):
""" Test update of Rivet, MadAnalyses 5 and SModelS analyses """
""" Test update of Rivet, MadAnalyses 5, SModelS and Combine analyses """

# Import a record that already has a Rivet analysis attached (but with '#' in the URL)
import_records(['ins1203852'], synchronous=True)
@@ -1074,6 +1074,23 @@ def test_update_analyses(app):
submission = get_latest_hepsubmission(inspire_id='1847779', overall_status='finished')
assert is_current_user_subscribed_to_record(submission.publication_recid, user)

# Import a record that has an associated Combine analysis
import_records(['ins2796231'], synchronous=True)
analysis_resources = DataResource.query.filter_by(file_type='Combine').all()
assert len(analysis_resources) == 0
analysis_resources = DataResource.query.filter_by(file_location='https://doi.org/10.17181/bp9fx-6qs64').all()
assert len(analysis_resources) == 1
db.session.delete(analysis_resources[0]) # delete resource so it can be re-added in next step
db.session.commit()
update_analyses('Combine')
analysis_resources = DataResource.query.filter_by(file_type='Combine').all()
assert len(analysis_resources) == 1
assert analysis_resources[0].file_location == 'https://doi.org/10.17181/bp9fx-6qs64'
assert analysis_resources[0].file_description == 'Statistical models'
license_data = License.query.filter_by(id=analysis_resources[0].file_license).first()
assert license_data.name == 'cc-by-4.0'
assert license_data.url == 'https://creativecommons.org/licenses/by/4.0'


def test_generate_license_data_by_id(app):
"""
Loading

0 comments on commit cc0dce6

Please sign in to comment.