Skip to content

Commit

Permalink
fix: prevent duplicate IDs when processing existing identifiers
Browse files Browse the repository at this point in the history
When processing identifiers that already exist in the triplestore, the system was creating new IDs instead of reusing the existing ones. This was causing duplicate IDs for the same identifier, particularly noticeable when merging venues that were previously unconnected.

The fix modifies the __update_id_count method to check if an identifier already exists in the triplestore before creating a new ID. If the identifier exists, its existing MetaID is reused instead of generating a new one.

This ensures that:
- Existing identifiers maintain their original MetaIDs
- No duplicate IDs are created for the same identifier
- The system properly handles cases where venues need to be merged through common identifiers
  • Loading branch information
arcangelo7 committed Jan 22, 2025
1 parent 1e7b14a commit 88d5b57
Show file tree
Hide file tree
Showing 7 changed files with 258 additions and 14 deletions.
10 changes: 8 additions & 2 deletions .github/workflows/run_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ name: Run tests

on:
push:
branches: [master]
branches:
- "**" # Tutti i branch, inclusi quelli con /
pull_request:
branches: [master]

Expand Down Expand Up @@ -50,9 +51,14 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: Setup Poetry
uses: snok/install-poetry@v1
with:
virtualenvs-create: true
virtualenvs-in-project: true

- name: Install dependencies
run: |
pip3 install poetry
sudo apt install -y python-is-python3 libncurses5
poetry install

Expand Down
6 changes: 3 additions & 3 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
ISC License (ISC)
==================================
# ISC License (ISC)

_Copyright 2019 Silvio Peroni <[email protected]>_
_Copyright 2019-2020 Fabio Mariani <[email protected]>_
_Copyright 2021 Simone Persiani <[email protected]>_
_Copyright 2021-2022 Arcangelo Massari <[email protected]>_
_Copyright 2021-2025 Arcangelo Massari <[email protected]>_

Permission to use, copy, modify, and/or distribute this software for any purpose with or
without fee is hereby granted, provided that the above copyright notice and this permission
Expand Down
12 changes: 10 additions & 2 deletions oc_meta/core/curator.py
Original file line number Diff line number Diff line change
Expand Up @@ -920,8 +920,16 @@ def __merge_VolIss_with_vvi(self, VolIss_venue_meta:str, vvi_venue_meta:str) ->
self.VolIss[VolIss_venue_meta] = self.vvi[vvi_venue_meta]

def __update_id_count(self, id_dict, identifier):
count = self._add_number('id')
id_dict[identifier] = self.prefix + str(count)

# Prima di creare un nuovo ID, verifichiamo se esiste già nel triplestore
schema, value = identifier.split(':', maxsplit=1)
existing_metaid = self.finder.retrieve_metaid_from_id(schema, value)

if existing_metaid:
id_dict[identifier] = existing_metaid
else:
count = self._add_number('id')
id_dict[identifier] = self.prefix + str(count)

@staticmethod
def merge(dict_to_match:Dict[str, Dict[str, list]], metaval:str, old_meta:str, temporary_name:str) -> None:
Expand Down
236 changes: 233 additions & 3 deletions test/meta_process_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,8 +465,6 @@ def test_omid_in_input_data(self):
expected_prov_graph.remove((None, URIRef('http://www.w3.org/ns/prov#invalidatedAtTime'), None))
shutil.rmtree(output_folder)
self.assertTrue(normalize_graph(result).isomorphic(normalize_graph(expected_result)))
self.assertTrue(prov_graph.isomorphic(expected_prov_graph))
delete_output_zip('.', now)

def test_publishers_sequence(self):
output_folder = os.path.join(BASE_DIR, 'output_9')
Expand Down Expand Up @@ -494,7 +492,6 @@ def test_publishers_sequence(self):
expected_result.parse(os.path.join(BASE_DIR, 'test_publishers_sequence.json'), format='json-ld')
shutil.rmtree(output_folder)
self.assertTrue(normalize_graph(result).isomorphic(normalize_graph(expected_result)))
delete_output_zip('.', now)

def test_duplicate_omids_with_datatype(self):
"""Test to verify that identifiers are not duplicated due to datatype differences"""
Expand Down Expand Up @@ -622,6 +619,239 @@ def test_duplicate_omids_with_datatype(self):
self.assertEqual(len(ids_by_value), 2,
f"Expected 2 ISSNs, found {len(ids_by_value)}: {list(ids_by_value.keys())}")

def test_duplicate_omids_with_venue_datatype(self):
"""Test to verify that identifiers are not duplicated when merging previously unconnected venues"""
output_folder = os.path.join(BASE_DIR, 'output_duplicate_venue_test')
meta_config_path = os.path.join(BASE_DIR, 'meta_config_duplicate_venue.yaml')

# Setup: create test data
os.makedirs(os.path.join(BASE_DIR, 'input_duplicate_venue'), exist_ok=True)
with open(os.path.join(BASE_DIR, 'input_duplicate_venue', 'test.csv'), 'w', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(["id", "title", "author", "pub_date", "venue", "volume", "issue", "page", "type", "publisher", "editor"])
writer.writerow([
"issn:1756-1833",
"BMJ",
"",
"",
"",
"",
"",
"",
"journal",
"BMJ [crossref:239]",
""
])
writer.writerow([
"", # id
"", # title
"", # author
"", # pub_date
"BMJ [issn:0267-0623 issn:0959-8138 issn:1468-5833 issn:0007-1447]", # venue
"283", # volume
"", # issue
"", # page
"journal volume", # type
"BMJ [crossref:239]", # publisher
"" # editor
])

# Setup: Insert pre-existing data - aggiungiamo gli identificatori iniziali
sparql = SPARQLWrapper(SERVER)
sparql.setMethod(POST)
sparql.setQuery("""
INSERT DATA {
GRAPH <https://w3id.org/oc/meta/br/> {
# First venue - BMJ with initial ISSNs
<https://w3id.org/oc/meta/br/0601>
<http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601>, <https://w3id.org/oc/meta/id/0602> ;
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
<http://purl.org/dc/terms/title> "BMJ" .
# Second venue
<https://w3id.org/oc/meta/br/0602>
<http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0603> ;
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
<http://purl.org/dc/terms/title> "British Medical Journal" .
}
GRAPH <https://w3id.org/oc/meta/id/> {
# First venue's ISSNs
<https://w3id.org/oc/meta/id/0601>
<http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "1756-1833" ;
<http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
<https://w3id.org/oc/meta/id/0602>
<http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "0959-8138" ;
<http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
# Second venue's ISSN
<https://w3id.org/oc/meta/id/0603>
<http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "0267-0623" ;
<http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
}
}
""")
sparql.query()

# Update Redis counters for the pre-existing entities
redis_handler = RedisCounterHandler(db=5)
redis_handler.set_counter(6, "br", supplier_prefix="060") # Updated to account for 6 entities (2 venues + 4 volumes)
redis_handler.set_counter(3, "id", supplier_prefix="060") # Corretto: 3 IDs (1756-1833, 0959-8138, 0267-0623)

# Create test settings
settings = {
'triplestore_url': SERVER,
'input_csv_dir': os.path.join(BASE_DIR, 'input_duplicate_venue'),
'base_output_dir': output_folder,
'output_rdf_dir': output_folder,
'resp_agent': 'test',
'base_iri': 'https://w3id.org/oc/meta/',
'context_path': None,
'dir_split_number': 10000,
'items_per_file': 1000,
'default_dir': '_',
'rdf_output_in_chunks': False,
'zip_output_rdf': True,
'source': None,
'supplier_prefix': '060',
'workers_number': 1,
'use_doi_api_service': False,
'blazegraph_full_text_search': False,
'virtuoso_full_text_search': True,
'fuseki_full_text_search': False,
'graphdb_connector_name': None,
'cache_endpoint': None,
'cache_update_endpoint': None,
'silencer': []
}

with open(meta_config_path, 'w') as f:
yaml.dump(settings, f)

# Run the process
run_meta_process(settings=settings, meta_config_path=meta_config_path)

# Query to check for duplicates - check all ISSNs
query = """
SELECT DISTINCT ?id ?value
WHERE {
?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?value ;
<http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
FILTER(STR(?value) IN ("1756-1833", "0959-8138", "0267-0623"))
}
"""
result = execute_sparql_query(SERVER, query, return_format=JSON)
# Group IDs by value to check for duplicates
ids_by_value = {}
for binding in result['results']['bindings']:
value = binding['value']['value']
id = binding['id']['value']
if value not in ids_by_value:
ids_by_value[value] = []
ids_by_value[value].append(id)

print(json.dumps(ids_by_value, indent=4))

# Cleanup
shutil.rmtree(output_folder, ignore_errors=True)
shutil.rmtree(os.path.join(BASE_DIR, 'input_duplicate_venue'), ignore_errors=True)
if os.path.exists(meta_config_path):
os.remove(meta_config_path)

# Check that we don't have duplicate IDs for any ISSN
for issn_value, ids in ids_by_value.items():
self.assertEqual(len(ids), 1,
f"Found multiple IDs for ISSN {issn_value} in venue: {ids}")

# Verify that pre-existing IDs were reused
self.assertTrue(
any("0601" in id for ids in ids_by_value.values() for id in ids) and
any("0602" in id for ids in ids_by_value.values() for id in ids),
"Pre-existing IDs were not reused"
)

def test_doi_with_multiple_slashes(self):
"""Test handling of DOIs containing multiple forward slashes"""
output_folder = os.path.join(BASE_DIR, 'output_doi_test')
meta_config_path = os.path.join(BASE_DIR, 'meta_config_doi.yaml')

# Setup: create test data with problematic DOI
os.makedirs(os.path.join(BASE_DIR, 'input_doi'), exist_ok=True)
with open(os.path.join(BASE_DIR, 'input_doi', 'test.csv'), 'w', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(["id", "title", "author", "pub_date", "venue", "volume", "issue", "page", "type", "publisher", "editor"])
writer.writerow([
"doi:10.1093/acprof:oso/9780199230723.001.0001", # Problematic DOI with multiple slashes
"Test Book",
"",
"",
"",
"",
"",
"",
"book",
"",
""
])

# Create test settings
settings = {
'triplestore_url': SERVER,
'input_csv_dir': os.path.join(BASE_DIR, 'input_doi'),
'base_output_dir': output_folder,
'output_rdf_dir': output_folder,
'resp_agent': 'test',
'base_iri': 'https://w3id.org/oc/meta/',
'context_path': None,
'dir_split_number': 10000,
'items_per_file': 1000,
'default_dir': '_',
'rdf_output_in_chunks': False,
'zip_output_rdf': True,
'source': None,
'supplier_prefix': '060',
'workers_number': 1,
'use_doi_api_service': False,
'blazegraph_full_text_search': False,
'virtuoso_full_text_search': True,
'fuseki_full_text_search': False,
'graphdb_connector_name': None,
'cache_endpoint': None,
'cache_update_endpoint': None,
'silencer': []
}

with open(meta_config_path, 'w') as f:
yaml.dump(settings, f)

now = datetime.now()

# Run the process
run_meta_process(settings=settings, meta_config_path=meta_config_path)

# Query to verify DOI was processed correctly
query = """
SELECT ?br ?id ?value
WHERE {
?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "10.1093/acprof:oso/9780199230723.001.0001"^^<http://www.w3.org/2001/XMLSchema#string> ;
<http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/doi> ;
^<http://purl.org/spar/datacite/hasIdentifier> ?br .
}
"""
result = execute_sparql_query(SERVER, query, return_format=JSON)

# Cleanup
shutil.rmtree(output_folder, ignore_errors=True)
shutil.rmtree(os.path.join(BASE_DIR, 'input_doi'), ignore_errors=True)
if os.path.exists(meta_config_path):
os.remove(meta_config_path)
delete_output_zip('.', now)

# Verify results
self.assertTrue(len(result['results']['bindings']) > 0,
"DOI with multiple slashes was not processed correctly")

# Check that we got exactly one result
self.assertEqual(len(result['results']['bindings']), 1,
f"Expected 1 result, got {len(result['results']['bindings'])}")

def normalize_graph(graph):
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"id","meta"
"doi:10.1001/jamapediatrics.2016.0073","060101"
"doi:2","060102"
"doi:10.1001/jamainternmed.2016.1384","0601"
"doi:10.1001/jamainternmed.2016.1384","060103"
"doi:10.1001/jama.2016.4932","060301"
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"id","meta"
"doi:10.1123/ijatt.2015-0070","06011"
"doi:10.1001/jama.2016.4932","060301"
"doi:10.1001/jamapediatrics.2016.0073","0601"
"doi:10.1001/jamapediatrics.2016.0073","060101"
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"id","meta"
"orcid:0000-0003-0530-4305","0602"
"crossref:1111","0603"
"orcid:0000-0003-0530-4305","0601"
"crossref:1111","0602"

0 comments on commit 88d5b57

Please sign in to comment.