Skip to content

Commit

Permalink
feat: Add support for temporary identifiers in meta process
Browse files: browse the repository at this point in the history
- Add handling of temporary identifiers (temp:*) for internal deduplication
- Generate OMIDs for resources with only temporary identifiers
- Ensure temporary identifiers are not saved in final output
- Add test case to verify temporary identifier handling:
  - Checks OMID generation when only temp ID is present
  - Verifies temp IDs are not persisted in triplestore
  - Tests deduplication using temp IDs
  • Loading branch information
arcangelo7 committed Feb 1, 2025
1 parent 704150c commit 856c49e
Show file tree
Hide file tree
Showing 3 changed files with 1,511 additions and 582 deletions.
9 changes: 8 additions & 1 deletion oc_meta/core/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,10 @@ def __init__(
"wikidata",
"wikipedia",
}
self.schemas = self.ra_id_schemas.union(self.br_id_schemas)
self.temp_schema = {"temp"} # New schema for temporary identifiers
self.schemas = self.ra_id_schemas.union(self.br_id_schemas).union(
self.temp_schema
)

self.ra_index = self.indexer_id(ra_index)
self.br_index = self.indexer_id(br_index)
Expand Down Expand Up @@ -657,6 +660,10 @@ def __res_metaid(self, graph: BibliographicResource):

def id_creator(self, graph: BibliographicEntity, identifier: str, ra: bool) -> None:
new_id = None
# Skip temporary identifiers - they should not be saved in the final dataset
if identifier.startswith("temp:"):
return

if ra:
for ra_id_schema in self.ra_id_schemas:
if identifier.startswith(ra_id_schema):
Expand Down
12 changes: 11 additions & 1 deletion oc_meta/core/curator.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,24 +719,34 @@ def clean_id_list(
metaid = ""
id_list = list(filter(None, id_list))
clean_list = list()
temp_ids = list() # List to store temporary identifiers

for elem in id_list:
if elem in clean_list:
continue
elem = Cleaner(elem).normalize_hyphens()
identifier = elem.split(":", 1)
schema = identifier[0].lower()
value = identifier[1]
if schema == "omid":

if schema == "temp":
# Store temporary identifiers separately for deduplication
temp_ids.append(value)
continue
elif schema == "omid":
metaid = value.replace(pattern, "")
else:
normalized_id = Cleaner(elem).normalize_id(
valid_dois_cache=valid_dois_cache
)
if normalized_id:
clean_list.append(normalized_id)

how_many_meta = [i for i in id_list if i.lower().startswith("omid")]
if len(how_many_meta) > 1:
clean_list = [i for i in clean_list if not i.lower().startswith("omid")]

# NOTE: temporary IDs are collected in temp_ids above but are deliberately kept out of clean_list; only clean_list and metaid are returned here
return clean_list, metaid

def conflict(
Expand Down
Loading

0 comments on commit 856c49e

Please sign in to comment.