Skip to content

Commit

Permalink
fix: improve container discovery in graph traversal
Browse files Browse the repository at this point in the history
The previous implementation was missing potential containers for new resources
because it only followed outgoing 'part_of' relationships. This could prevent
reaching container resources in cases where the input resource was the subject
(not the object) of the 'part_of' relationship.

The fix adds inverse relationship traversal for non-root resources while
maintaining efficiency by:
1. Only checking inverse relationships for resources that could be containers
2. Skipping this check for root types (resources that can never be containers)

This ensures complete container discovery while avoiding unnecessary queries
for leaf resources in the containment hierarchy.
  • Loading branch information
arcangelo7 committed Jan 22, 2025
1 parent 6d61847 commit 45f285c
Show file tree
Hide file tree
Showing 7 changed files with 605 additions and 22 deletions.
23 changes: 23 additions & 0 deletions oc_meta/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,30 @@
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.

from oc_ocdm.graph.graph_entity import GraphEntity


# Identifier values that must always be rejected (known placeholder IDs).
FORBIDDEN_IDS = {'issn:0000-0000'}
# Resource-type labels treated as publication venues.
# NOTE(review): presumably matched against the 'type' column of input rows — verify against usage.
VENUES = {'archival-document', 'book', 'book-part', 'book-section', 'book-series', 'book-set', 'edited-book', 'journal', 'journal-volume', 'journal-issue', 'monograph', 'proceedings-series', 'proceedings', 'reference-book', 'report-series', 'standard-series'}
# Resource-type labels for which the editor presumably belongs to the
# containing resource rather than the resource itself — TODO confirm against callers.
CONTAINER_EDITOR_TYPES = {'book chapter', 'book part', 'book section', 'book track', 'component', 'proceedings article', 'reference entry'}

# Constants for bibliographic resources that cannot contain other resources.
# Used to skip inverse 'part_of' lookups for entities typed as leaves of the
# containment hierarchy (they can never be containers).
ROOT_CONTAINER_TYPES = {
    GraphEntity.iri_journal_article,        # fabio:JournalArticle
    GraphEntity.iri_book_chapter,           # fabio:BookChapter
    GraphEntity.iri_proceedings_paper,      # fabio:ProceedingsPaper
    GraphEntity.iri_journal_editorial,      # fabio:JournalEditorial
    GraphEntity.iri_newspaper_article,      # fabio:NewspaperArticle
    GraphEntity.iri_newspaper_editorial,    # fabio:NewspaperEditorial
    GraphEntity.iri_reference_entry,        # fabio:ReferenceEntry
    GraphEntity.iri_retraction_notice,      # fabio:RetractionNotice
    GraphEntity.iri_peer_review,            # fabio:PeerReview
    GraphEntity.iri_data_file,              # fabio:DataFile
    GraphEntity.iri_computer_program,       # fabio:ComputerProgram
    GraphEntity.iri_presentation,           # fabio:Presentation
    GraphEntity.iri_web_content,            # fabio:WebContent
    GraphEntity.iri_data_management_plan,   # fabio:DataManagementPlan
    GraphEntity.iri_abstract,               # fabio:Abstract
    GraphEntity.iri_editorial,              # fabio:Editorial
    GraphEntity.iri_preprint,               # fabio:Preprint
}
23 changes: 16 additions & 7 deletions oc_meta/core/curator.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ def clean_vvi(self, row: Dict[str, str]) -> None:
issue = row['issue']
br_id = row['id']
venue = row['venue']

# Venue
if venue:
# The data must be invalidated, because the resource is journal but a volume or an issue have also been specified
Expand All @@ -306,6 +307,7 @@ def clean_vvi(self, row: Dict[str, str]) -> None:
else:
idslist = re.split(one_or_more_spaces, re.sub(colon_and_spaces, ':', venue_id))
idslist, metaval = self.clean_id_list(idslist, br=True, valid_dois_cache=self.valid_dois_cache)

metaval = self.id_worker('venue', name, idslist, metaval, ra_ent=False, br_ent=True, vvi_ent=True, publ_entity=False)
if metaval not in self.vvi:
ts_vvi = None
Expand All @@ -324,34 +326,40 @@ def clean_vvi(self, row: Dict[str, str]) -> None:
self.vvi[metaval]['volume'] = dict()
self.vvi[metaval]['issue'] = dict()
row['venue'] = metaval

# Volume
if volume and (br_type == 'journal issue' or br_type == 'journal article'):
if volume in self.vvi[metaval]['volume']:
vol_meta = self.vvi[metaval]['volume'][volume]['id']
else:
vol_meta = self.new_entity(self.brdict, '')
self.vvi[metaval]['volume'][volume] = dict()
self.vvi[metaval]['volume'][volume]['id'] = vol_meta
self.vvi[metaval]['volume'][volume]['issue'] = dict()
# Check if the volume exists in the triplestore before creating a new one
ts_vvi = self.finder.retrieve_venue_from_meta(metaval)
if volume in ts_vvi['volume']:
vol_meta = ts_vvi['volume'][volume]['id']
# Update local structure with triplestore data
self.vvi[metaval]['volume'][volume] = ts_vvi['volume'][volume]
else:
vol_meta = self.new_entity(self.brdict, '')
self.vvi[metaval]['volume'][volume] = dict()
self.vvi[metaval]['volume'][volume]['id'] = vol_meta
self.vvi[metaval]['volume'][volume]['issue'] = dict()
elif volume and br_type == 'journal volume':
# The data must be invalidated, because the resource is a journal volume but an issue has also been specified
if issue:
row['volume'] = ''
row['issue'] = ''
else:
vol_meta = br_id
self.volume_issue(vol_meta, self.vvi[metaval]['volume'], volume, row)

# Issue
if issue and br_type == 'journal article':
row['issue'] = issue
if vol_meta:
# issue inside volume
if issue not in self.vvi[metaval]['volume'][volume]['issue']:
issue_meta = self.new_entity(self.brdict, '')
self.vvi[metaval]['volume'][volume]['issue'][issue] = dict()
self.vvi[metaval]['volume'][volume]['issue'][issue]['id'] = issue_meta
else:
# issue inside venue (without volume)
if issue not in self.vvi[metaval]['issue']:
issue_meta = self.new_entity(self.brdict, '')
self.vvi[metaval]['issue'][issue] = dict()
Expand All @@ -362,6 +370,7 @@ def clean_vvi(self, row: Dict[str, str]) -> None:
self.volume_issue(issue_meta, self.vvi[metaval]['volume'][volume]['issue'], issue, row)
else:
self.volume_issue(issue_meta, self.vvi[metaval]['issue'], issue, row)

else:
row['venue'] = ''
row['volume'] = ''
Expand Down
40 changes: 38 additions & 2 deletions oc_meta/lib/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import yaml
from dateutil import parser
from oc_meta.constants import ROOT_CONTAINER_TYPES
from oc_meta.plugins.editor import MetaEditor
from oc_ocdm.graph import GraphEntity
from oc_ocdm.graph.graph_entity import GraphEntity
Expand Down Expand Up @@ -355,11 +356,13 @@ def retrieve_venue_from_meta(self, meta_id:str) -> Dict[str, Dict[str, str]]:
content['issue'] = dict()
content['volume'] = dict()
content = self.__retrieve_vvi(meta_id, content)

return content

def __retrieve_vvi(self, meta:str, content:Dict[str, dict]) -> dict:
venue_iri = URIRef(f'{self.base_iri}/br/{meta}')
ress = []

for triple in self.local_g.triples((None, GraphEntity.iri_part_of, venue_iri)):
res = {'res': None, 'type': None, 'sequence_identifier': None, 'container': None}
res['res'] = triple[0].replace(f'{self.base_iri}/br/', '')
Expand All @@ -371,6 +374,7 @@ def __retrieve_vvi(self, meta:str, content:Dict[str, dict]) -> dict:
elif res_triple[1] == GraphEntity.iri_part_of:
res['container'] = res_triple[2]
ress.append(res)

for res in ress:
if res['res'] is not None:
if res['type'] == GraphEntity.iri_journal_issue and res['container'] == venue_iri:
Expand All @@ -380,6 +384,7 @@ def __retrieve_vvi(self, meta:str, content:Dict[str, dict]) -> dict:
content['volume'].setdefault(res['sequence_identifier'], dict())
content['volume'][res['sequence_identifier']]['id'] = res['res']
content['volume'][res['sequence_identifier']]['issue'] = self.__retrieve_issues_by_volume(URIRef(f"{self.base_iri}/br/{res['res']}"))

return content

def __retrieve_issues_by_volume(self, res:URIRef) -> dict:
Expand All @@ -389,6 +394,7 @@ def __retrieve_issues_by_volume(self, res:URIRef) -> dict:
if res_triple[1] == GraphEntity.iri_has_sequence_identifier:
content.setdefault(str(res_triple[2]), dict())
content[str(res_triple[2])]['id'] = res_triple[0].replace(f'{self.base_iri}/br/', '')

return content

def retrieve_ra_sequence_from_br_meta(self, metaid: str, col_name: str) -> List[Dict[str, tuple]]:
Expand Down Expand Up @@ -835,14 +841,18 @@ def process_batch(subjects, cur_depth):

next_subjects = set()
for batch in batch_process(list(subjects), BATCH_SIZE):
# Query to get direct triples and object types
query_prefix = f'''
SELECT ?s ?p ?o
WHERE {{
VALUES ?s {{ {' '.join([f"<{s}>" for s in batch])} }}
?s ?p ?o.
?s ?p ?o .
}}'''

# Process direct triples and collect objects that could be containers
potential_containers = set()
result = self.__query(query_prefix)

if result:
for row in result['results']['bindings']:
s = URIRef(row['s']['value'])
Expand All @@ -852,10 +862,36 @@ def process_batch(subjects, cur_depth):
o_datatype = URIRef(row['o']['datatype']) if 'datatype' in row['o'] else None
o = URIRef(o) if o_type == 'uri' else Literal(lexical_or_value=o, datatype=o_datatype)
self.local_g.add((s, p, o))
if p == RDF.type and o not in ROOT_CONTAINER_TYPES:
potential_containers.add(str(s))

# Add non-special objects to next_subjects as before
if isinstance(o, URIRef) and p not in {RDF.type, GraphEntity.iri_with_role, GraphEntity.iri_uses_identifier_scheme}:
next_subjects.add(str(o))

# After processing all batches at this level, proceed to the next depth level
# Only run inverse query for potential containers
if potential_containers:
inverse_query = f'''
SELECT ?s ?p ?o
WHERE {{
VALUES ?container {{ {' '.join([f"<{s}>" for s in potential_containers])} }}
?s <{GraphEntity.iri_part_of}> ?container .
?s ?p ?o .
}}'''

result = self.__query(inverse_query)
if result:
for row in result['results']['bindings']:
s = URIRef(row['s']['value'])
p = URIRef(row['p']['value'])
o = row['o']['value']
o_type = row['o']['type']
o_datatype = URIRef(row['o']['datatype']) if 'datatype' in row['o'] else None
o = URIRef(o) if o_type == 'uri' else Literal(lexical_or_value=o, datatype=o_datatype)
self.local_g.add((s, p, o))
next_subjects.add(str(s))

# Process next level
process_batch(next_subjects, cur_depth + 1)

def get_initial_subjects_from_metavals(metavals):
Expand Down
Loading

0 comments on commit 45f285c

Please sign in to comment.