Skip to content

Commit

Permalink
fix: improve container discovery in graph traversal
Browse files Browse the repository at this point in the history
The previous implementation was missing potential containers for new resources
because it only followed outgoing 'part_of' relationships. This could prevent
reaching container resources in cases where the input resource was the subject
(not the object) of the 'part_of' relationship.

The fix adds inverse relationship traversal for non-root resources while
maintaining efficiency by:
1. Only checking inverse relationships for resources that could be containers
2. Skipping this check for root types (resources that can never be containers)

This ensures complete container discovery while avoiding unnecessary queries
for leaf resources in the containment hierarchy.
  • Loading branch information
arcangelo7 committed Jan 22, 2025
1 parent 6d61847 commit 45f285c
Show file tree
Hide file tree
Showing 7 changed files with 605 additions and 22 deletions.
23 changes: 23 additions & 0 deletions oc_meta/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,30 @@
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.

from oc_ocdm.graph.graph_entity import GraphEntity


# Identifier values that must always be rejected (known placeholder IDs).
FORBIDDEN_IDS = {'issn:0000-0000'}
# Resource-type labels treated as publication venues.
# NOTE(review): presumably matched against the 'type' column of input rows — verify against usage.
VENUES = {'archival-document', 'book', 'book-part', 'book-section', 'book-series', 'book-set', 'edited-book', 'journal', 'journal-volume', 'journal-issue', 'monograph', 'proceedings-series', 'proceedings', 'reference-book', 'report-series', 'standard-series'}
# Resource-type labels for which the editor presumably belongs to the
# containing resource rather than the resource itself — TODO confirm against callers.
CONTAINER_EDITOR_TYPES = {'book chapter', 'book part', 'book section', 'book track', 'component', 'proceedings article', 'reference entry'}

# Constants for bibliographic resources that cannot contain other resources.
# Used to skip inverse 'part_of' lookups for entities typed as leaves of the
# containment hierarchy (they can never be containers).
ROOT_CONTAINER_TYPES = {
    GraphEntity.iri_journal_article,        # fabio:JournalArticle
    GraphEntity.iri_book_chapter,           # fabio:BookChapter
    GraphEntity.iri_proceedings_paper,      # fabio:ProceedingsPaper
    GraphEntity.iri_journal_editorial,      # fabio:JournalEditorial
    GraphEntity.iri_newspaper_article,      # fabio:NewspaperArticle
    GraphEntity.iri_newspaper_editorial,    # fabio:NewspaperEditorial
    GraphEntity.iri_reference_entry,        # fabio:ReferenceEntry
    GraphEntity.iri_retraction_notice,      # fabio:RetractionNotice
    GraphEntity.iri_peer_review,            # fabio:PeerReview
    GraphEntity.iri_data_file,              # fabio:DataFile
    GraphEntity.iri_computer_program,       # fabio:ComputerProgram
    GraphEntity.iri_presentation,           # fabio:Presentation
    GraphEntity.iri_web_content,            # fabio:WebContent
    GraphEntity.iri_data_management_plan,   # fabio:DataManagementPlan
    GraphEntity.iri_abstract,               # fabio:Abstract
    GraphEntity.iri_editorial,              # fabio:Editorial
    GraphEntity.iri_preprint,               # fabio:Preprint
}
23 changes: 16 additions & 7 deletions oc_meta/core/curator.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ def clean_vvi(self, row: Dict[str, str]) -> None:
issue = row['issue']
br_id = row['id']
venue = row['venue']

# Venue
if venue:
# The data must be invalidated, because the resource is journal but a volume or an issue have also been specified
Expand All @@ -306,6 +307,7 @@ def clean_vvi(self, row: Dict[str, str]) -> None:
else:
idslist = re.split(one_or_more_spaces, re.sub(colon_and_spaces, ':', venue_id))
idslist, metaval = self.clean_id_list(idslist, br=True, valid_dois_cache=self.valid_dois_cache)

metaval = self.id_worker('venue', name, idslist, metaval, ra_ent=False, br_ent=True, vvi_ent=True, publ_entity=False)
if metaval not in self.vvi:
ts_vvi = None
Expand All @@ -324,34 +326,40 @@ def clean_vvi(self, row: Dict[str, str]) -> None:
self.vvi[metaval]['volume'] = dict()
self.vvi[metaval]['issue'] = dict()
row['venue'] = metaval

# Volume
if volume and (br_type == 'journal issue' or br_type == 'journal article'):
if volume in self.vvi[metaval]['volume']:
vol_meta = self.vvi[metaval]['volume'][volume]['id']
else:
vol_meta = self.new_entity(self.brdict, '')
self.vvi[metaval]['volume'][volume] = dict()
self.vvi[metaval]['volume'][volume]['id'] = vol_meta
self.vvi[metaval]['volume'][volume]['issue'] = dict()
# Check if the volume exists in the triplestore before creating a new one
ts_vvi = self.finder.retrieve_venue_from_meta(metaval)
if volume in ts_vvi['volume']:
vol_meta = ts_vvi['volume'][volume]['id']
# Update local structure with triplestore data
self.vvi[metaval]['volume'][volume] = ts_vvi['volume'][volume]
else:
vol_meta = self.new_entity(self.brdict, '')
self.vvi[metaval]['volume'][volume] = dict()
self.vvi[metaval]['volume'][volume]['id'] = vol_meta
self.vvi[metaval]['volume'][volume]['issue'] = dict()
elif volume and br_type == 'journal volume':
# The data must be invalidated, because the resource is a journal volume but an issue has also been specified
if issue:
row['volume'] = ''
row['issue'] = ''
else:
vol_meta = br_id
self.volume_issue(vol_meta, self.vvi[metaval]['volume'], volume, row)

# Issue
if issue and br_type == 'journal article':
row['issue'] = issue
if vol_meta:
# issue inside volume
if issue not in self.vvi[metaval]['volume'][volume]['issue']:
issue_meta = self.new_entity(self.brdict, '')
self.vvi[metaval]['volume'][volume]['issue'][issue] = dict()
self.vvi[metaval]['volume'][volume]['issue'][issue]['id'] = issue_meta
else:
# issue inside venue (without volume)
if issue not in self.vvi[metaval]['issue']:
issue_meta = self.new_entity(self.brdict, '')
self.vvi[metaval]['issue'][issue] = dict()
Expand All @@ -362,6 +370,7 @@ def clean_vvi(self, row: Dict[str, str]) -> None:
self.volume_issue(issue_meta, self.vvi[metaval]['volume'][volume]['issue'], issue, row)
else:
self.volume_issue(issue_meta, self.vvi[metaval]['issue'], issue, row)

else:
row['venue'] = ''
row['volume'] = ''
Expand Down
40 changes: 38 additions & 2 deletions oc_meta/lib/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import yaml
from dateutil import parser
from oc_meta.constants import ROOT_CONTAINER_TYPES
from oc_meta.plugins.editor import MetaEditor
from oc_ocdm.graph import GraphEntity
from oc_ocdm.graph.graph_entity import GraphEntity
Expand Down Expand Up @@ -355,11 +356,13 @@ def retrieve_venue_from_meta(self, meta_id:str) -> Dict[str, Dict[str, str]]:
content['issue'] = dict()
content['volume'] = dict()
content = self.__retrieve_vvi(meta_id, content)

return content

def __retrieve_vvi(self, meta:str, content:Dict[str, dict]) -> dict:
venue_iri = URIRef(f'{self.base_iri}/br/{meta}')
ress = []

for triple in self.local_g.triples((None, GraphEntity.iri_part_of, venue_iri)):
res = {'res': None, 'type': None, 'sequence_identifier': None, 'container': None}
res['res'] = triple[0].replace(f'{self.base_iri}/br/', '')
Expand All @@ -371,6 +374,7 @@ def __retrieve_vvi(self, meta:str, content:Dict[str, dict]) -> dict:
elif res_triple[1] == GraphEntity.iri_part_of:
res['container'] = res_triple[2]
ress.append(res)

for res in ress:
if res['res'] is not None:
if res['type'] == GraphEntity.iri_journal_issue and res['container'] == venue_iri:
Expand All @@ -380,6 +384,7 @@ def __retrieve_vvi(self, meta:str, content:Dict[str, dict]) -> dict:
content['volume'].setdefault(res['sequence_identifier'], dict())
content['volume'][res['sequence_identifier']]['id'] = res['res']
content['volume'][res['sequence_identifier']]['issue'] = self.__retrieve_issues_by_volume(URIRef(f"{self.base_iri}/br/{res['res']}"))

return content

def __retrieve_issues_by_volume(self, res:URIRef) -> dict:
Expand All @@ -389,6 +394,7 @@ def __retrieve_issues_by_volume(self, res:URIRef) -> dict:
if res_triple[1] == GraphEntity.iri_has_sequence_identifier:
content.setdefault(str(res_triple[2]), dict())
content[str(res_triple[2])]['id'] = res_triple[0].replace(f'{self.base_iri}/br/', '')

return content

def retrieve_ra_sequence_from_br_meta(self, metaid: str, col_name: str) -> List[Dict[str, tuple]]:
Expand Down Expand Up @@ -835,14 +841,18 @@ def process_batch(subjects, cur_depth):

next_subjects = set()
for batch in batch_process(list(subjects), BATCH_SIZE):
# Query to get direct triples and object types
query_prefix = f'''
SELECT ?s ?p ?o
WHERE {{
VALUES ?s {{ {' '.join([f"<{s}>" for s in batch])} }}
?s ?p ?o.
?s ?p ?o .
}}'''

# Process direct triples and collect objects that could be containers
potential_containers = set()
result = self.__query(query_prefix)

if result:
for row in result['results']['bindings']:
s = URIRef(row['s']['value'])
Expand All @@ -852,10 +862,36 @@ def process_batch(subjects, cur_depth):
o_datatype = URIRef(row['o']['datatype']) if 'datatype' in row['o'] else None
o = URIRef(o) if o_type == 'uri' else Literal(lexical_or_value=o, datatype=o_datatype)
self.local_g.add((s, p, o))
if p == RDF.type and o not in ROOT_CONTAINER_TYPES:
potential_containers.add(str(s))

# Add non-special objects to next_subjects as before
if isinstance(o, URIRef) and p not in {RDF.type, GraphEntity.iri_with_role, GraphEntity.iri_uses_identifier_scheme}:
next_subjects.add(str(o))

# After processing all batches at this level, proceed to the next depth level
# Only run inverse query for potential containers
if potential_containers:
inverse_query = f'''
SELECT ?s ?p ?o
WHERE {{
VALUES ?container {{ {' '.join([f"<{s}>" for s in potential_containers])} }}
?s <{GraphEntity.iri_part_of}> ?container .
?s ?p ?o .
}}'''

result = self.__query(inverse_query)
if result:
for row in result['results']['bindings']:
s = URIRef(row['s']['value'])
p = URIRef(row['p']['value'])
o = row['o']['value']
o_type = row['o']['type']
o_datatype = URIRef(row['o']['datatype']) if 'datatype' in row['o'] else None
o = URIRef(o) if o_type == 'uri' else Literal(lexical_or_value=o, datatype=o_datatype)
self.local_g.add((s, p, o))
next_subjects.add(str(s))

# Process next level
process_batch(next_subjects, cur_depth + 1)

def get_initial_subjects_from_metavals(metavals):
Expand Down
Loading

0 comments on commit 45f285c

Please sign in to comment.