Skip to content

Commit

Permalink
delete existing files on new zenodo deposit version; related to #304
Browse files Browse the repository at this point in the history
  • Loading branch information
Jorrit Poelen committed Sep 9, 2024
1 parent c46025e commit cc33b09
Show file tree
Hide file tree
Showing 23 changed files with 527 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ public abstract class CmdZenodoEnabled extends LoggingPersisting implements Runn
)
private String apiEndpoint = EnvUtil.getEnvironmentVariable(ZENODO_ENDPOINT, "https://zenodo.org");

/**
 * Programmatic setter for the {@code createNewVersionForExisting} field.
 *
 * NOTE(review): presumably the same flag that the {@code --new-version} command line
 * option controls — confirm against the field declaration.
 *
 * @param createNewVersionForExisting value assigned to the field as-is (may be {@code null},
 *                                    since the parameter is a boxed {@link Boolean})
 */
public void setCreateNewVersionForExisting(Boolean createNewVersionForExisting) {
    this.createNewVersionForExisting = createNewVersionForExisting;
}

@CommandLine.Option(
names = {"--new-version"},
description = "create new version if a Zenodo deposit with matching identifiers already exists"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import java.io.InputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.TreeSet;
Expand Down Expand Up @@ -69,16 +70,25 @@ static ObjectMapper getObjectMapper() {

/**
 * Sends an HTTP DELETE for the deposition identified by {@code ctx} and hands the
 * response stream to {@code updateContext}.
 *
 * NOTE(review): as captured here the body holds two consecutive try-with-resources
 * headers — an inline {@code ResourcesHTTP.asInputStream(...)} call and a call to
 * {@code delete(deleteRequestURI)}. This looks like the removed and the added lines of a
 * change rendered together; confirm which variant is current before relying on this text.
 *
 * @param ctx supplies the endpoint and the deposit id used to build the request URI
 * @throws IOException declared by the method; presumably raised when the request or
 *                     the context update fails — verify against the helpers
 */
public static void delete(ZenodoContext ctx) throws IOException {
// request URI: <endpoint>/api/deposit/depositions/<depositId>
String deleteRequestURI = ctx.getEndpoint() + "/api/deposit/depositions/" + ctx.getDepositId();
try (InputStream inputStream = ResourcesHTTP.asInputStream(
RefNodeFactory.toIRI(deleteRequestURI),
new HttpDelete(URI.create(deleteRequestURI)),
ignoreProgress(),
ignoreNone()
)) {
try (InputStream inputStream = delete(deleteRequestURI)) {
// the response body is passed on to update the context
updateContext(ctx, inputStream);
}
}

/**
 * String convenience overload: converts the request URI to an {@link IRI} and
 * delegates to {@link #delete(IRI)}.
 *
 * @param deleteRequestURI URI of the resource to send the DELETE request to
 * @return the stream returned by {@link #delete(IRI)}; the caller is responsible for closing it
 * @throws IOException propagated from {@link #delete(IRI)}
 */
public static InputStream delete(String deleteRequestURI) throws IOException {
    return delete(RefNodeFactory.toIRI(deleteRequestURI));
}

/**
 * Issues an HTTP DELETE request against the given IRI.
 *
 * @param dataURI target of the DELETE request
 * @return the stream produced by {@code ResourcesHTTP.asInputStream}; the caller is
 *         responsible for closing it
 * @throws IOException propagated from {@code ResourcesHTTP.asInputStream}
 */
public static InputStream delete(IRI dataURI) throws IOException {
    HttpDelete deleteRequest = new HttpDelete(dataURI.getIRIString());
    return ResourcesHTTP.asInputStream(dataURI, deleteRequest, ignoreProgress(), ignoreNone());
}

public static ZenodoContext update(ZenodoContext ctx, String metadata) throws IOException {
String requestURI = ctx.getEndpoint() + "/api/deposit/depositions/" + ctx.getDepositId();
IRI dataURI = RefNodeFactory.toIRI(requestURI);
Expand Down Expand Up @@ -287,4 +297,20 @@ private static void executeQueryAndCollectIds(Collection<Pair<Long, String>> fou
/**
 * Builds the query IRI {@code <apiEndpoint><method>?<filter>}.
 *
 * @param apiEndpoint base URL of the API
 * @param filter      query string appended after the {@code ?}
 * @param method      path appended directly to {@code apiEndpoint}
 * @return the concatenated URL as an {@link IRI}
 */
private static IRI getQuery(String apiEndpoint, String filter, String method) {
    String queryUrl = apiEndpoint + method + "?" + filter;
    return RefNodeFactory.toIRI(queryUrl);
}

/**
 * Lists the deposition file endpoints for every entry under {@code /files} in the
 * metadata of {@code ctx}.
 *
 * Each entry that has an {@code id} yields
 * {@code <endpoint>/api/deposit/depositions/<depositId>/files/<id>}; entries without an
 * {@code id} are skipped.
 *
 * @param ctx provides the metadata, the endpoint and the deposit id
 * @return the file endpoints in metadata order; empty when the context has no metadata
 *         or the metadata lists no files
 */
public static List<IRI> getFileEndpoints(ZenodoContext ctx) {
    List<IRI> fileEndpoints = new ArrayList<>();
    JsonNode metadata = ctx.getMetadata();
    if (metadata == null) {
        return fileEndpoints;
    }
    // a missing "/files" node iterates as empty, so no separate check is needed
    for (JsonNode file : metadata.at("/files")) {
        JsonNode fileId = file.at("/id");
        if (fileId.isMissingNode()) {
            continue;
        }
        String endpoint = ctx.getEndpoint() + "/api/deposit/depositions/" + ctx.getDepositId() + "/files/" + fileId.asText();
        fileEndpoints.add(RefNodeFactory.toIRI(endpoint));
    }
    return fileEndpoints;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
import bio.guoda.preston.stream.ContentStreamHandler;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.sun.xml.internal.ws.policy.privateutil.PolicyUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.NullOutputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.rdf.api.IRI;
Expand All @@ -28,6 +31,7 @@
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;

import static bio.guoda.preston.zenodo.ZenodoUtils.delete;
import static bio.guoda.preston.zenodo.ZenodoUtils.getObjectMapper;

public class ZenodoMetadataFileStreamHandler implements ContentStreamHandler {
Expand Down Expand Up @@ -186,6 +190,7 @@ private void createNewVersion(IRI coordinate, JsonNode zenodoMetadata, List<Stri
} else if (existingIds.size() == 1 && ctx.createNewVersionForExisting()) {
ctxLocal.setDepositId(existingIds.get(0));
ctxLocal = ZenodoUtils.createNewVersion(ctxLocal);
deleteExistingContentIfPresent(ctxLocal);
String input = getObjectMapper().writer().writeValueAsString(zenodoMetadata);
ZenodoUtils.update(ctxLocal, input);
uploadContentAndPublish(zenodoMetadata, contentIds, ctxLocal);
Expand All @@ -208,6 +213,17 @@ private void createNewVersion(IRI coordinate, JsonNode zenodoMetadata, List<Stri
}
}

/**
 * Sends a DELETE request for every file endpoint that {@code ZenodoUtils.getFileEndpoints}
 * reports for the given context, so that no previously listed files remain.
 *
 * @param ctxLocal context whose metadata lists the files to delete
 * @throws IOException when deleting (or reading the response for) any file fails; the
 *                     original exception is attached as the cause
 */
private void deleteExistingContentIfPresent(ZenodoContext ctxLocal) throws IOException {
    for (IRI existingFile : ZenodoUtils.getFileEndpoints(ctxLocal)) {
        try (InputStream response = delete(existingFile)) {
            // read the response to the end, discarding its content
            IOUtils.copy(response, NullOutputStream.INSTANCE);
        } catch (IOException cause) {
            String message = "failed to delete existing file [" + existingFile.getIRIString() + "] for deposition [" + ctxLocal.getMetadata() + "]";
            throw new IOException(message, cause);
        }
    }
}

public static boolean hasAllowedPublicationDate(JsonNode zenodoMetadata, ZenodoConfig ctx) {
return !zenodoMetadata.at("/metadata/publication_date").isMissingNode()
|| ctx.shouldAllowEmptyPublicationDate();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package bio.guoda.preston.zenodo;

import bio.guoda.preston.HashType;
import bio.guoda.preston.RefNodeFactory;
import bio.guoda.preston.ResourcesHTTP;
import bio.guoda.preston.cmd.ZenodoMetaUtil;
Expand All @@ -22,6 +23,7 @@
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Matcher;
Expand All @@ -30,6 +32,7 @@
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.startsWith;


Expand Down Expand Up @@ -122,6 +125,79 @@ public void createDepositAndUpdateToRestrict() throws URISyntaxException, IOExce

}

/**
 * Publishes two different resources one after the other and checks, after each run,
 * that the resulting deposit lists exactly one file with the expected checksum.
 */
@Test
public void createDepositAndCreateNewVersionWithDifferentFile() throws URISyntaxException, IOException {
    ByteArrayOutputStream firstRunLog = new ByteArrayOutputStream();
    publishNewVersionOfResource(firstRunLog, "darktaxon-delete/29/cc/29ccb90cb281069a23b4e07ec10583c8");
    assertStateOfDepositedFiles(firstRunLog, "md5:d3b07384d113edec49eaa6238ad5ff00");

    ByteArrayOutputStream secondRunLog = new ByteArrayOutputStream();
    publishNewVersionOfResource(secondRunLog, "darktaxon-delete/97/4c/974c37be498a377c3795bcbdb6e20581");
    assertStateOfDepositedFiles(secondRunLog, "md5:c157a79031e1c40f85931829bc5fc552");
}

/**
 * Extracts the record id from the command output, fetches the deposit metadata for it
 * and asserts that the deposit lists exactly one file with the given checksum.
 *
 * @param outputStream    captured output of the command run
 * @param fileChecksumOld checksum expected at {@code /files/0/checksum} of the deposit metadata
 * @throws IOException declared for {@code requestDepositMetadata} — presumably raised
 *                     when the metadata request fails
 */
private void assertStateOfDepositedFiles(ByteArrayOutputStream outputStream, String fileChecksumOld) throws IOException {
    String commandOutput = new String(outputStream.toByteArray(), StandardCharsets.UTF_8);
    assertThat(StringUtils.split(commandOutput, '\n').length, greaterThan(0));
    assertThat(commandOutput, containsString("https://sandbox.zenodo.org/records/"));

    // newlines are replaced so that the pattern can match across the whole output
    Matcher recordMatcher = Pattern
            .compile(".*https://sandbox.zenodo.org/records/(?<depositId>[0-9]+).*")
            .matcher(StringUtils.replace(commandOutput, "\n", " "));
    assertThat(recordMatcher.matches(), Is.is(true));

    String depositId = recordMatcher.group("depositId");
    JsonNode depositMetadata = requestDepositMetadata(depositId);

    ZenodoContext ctx = new ZenodoContext("secret", "https://sandbox.zenodo.org");
    ctx.setMetadata(depositMetadata);
    ctx.setDepositId(Long.parseLong(depositId));
    assertThat(ZenodoUtils.getFileEndpoints(ctx).size(), is(1));

    System.out.println(depositMetadata.toPrettyString());

    JsonNode checksum = depositMetadata.at("/files/0/checksum");
    assertThat(checksum.isMissingNode(), is(false));
    assertThat(checksum.asText(), is(fileChecksumOld));
}

/**
 * Configures a {@code CmdZenodo} against the Zenodo sandbox with new-version creation
 * enabled, feeds it the given classpath resource and runs it.
 *
 * @param outputStream receives the output of the command
 * @param resourceOld  classpath resource (relative to this class) used as command input;
 *                     the directory three levels above it is used as the remote
 * @throws IOException when the temporary data directory cannot be created
 */
private void publishNewVersionOfResource(ByteArrayOutputStream outputStream, String resourceOld) throws IOException {
    CmdZenodo cmd = new CmdZenodo();

    URL resourceLocation = getClass().getResource(resourceOld);
    File resourceFile = new File(resourceLocation.getFile());
    URI remote = resourceFile
            .getParentFile()
            .getParentFile()
            .getParentFile()
            .toURI();

    File dataDir = folder.newFolder();
    cmd.setLocalDataDir(dataDir.getAbsolutePath());
    cmd.setRemotes(Arrays.asList(remote));
    cmd.setHashType(HashType.md5);
    cmd.setApiEndpoint("https://sandbox.zenodo.org");
    cmd.setCreateNewVersionForExisting(true);

    System.setProperty("ZENODO_TOKEN", ZenodoTestUtil.getAccessToken());

    cmd.setInputStream(getClass().getResourceAsStream(resourceOld));
    cmd.setOutputStream(outputStream);
    cmd.setCacheEnabled(false);

    // NOTE(review): presumably creates the deposit on a first run and a new version on
    // later runs — confirm against CmdZenodo
    cmd.run();
}


private CmdZenodo createCmd(ByteArrayOutputStream outputStream) throws IOException {
CmdZenodo cmdZenodo = new CmdZenodo();
String resourceURI = "batlit-data/31/31/3131a4dd8ed099a31e2f2032c0248ba7";
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
package bio.guoda.preston.zenodo;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.rdf.api.IRI;
import org.hamcrest.core.Is;
import org.junit.Test;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;
Expand Down Expand Up @@ -58,4 +64,39 @@ public void queryForExistingRecords() {
);
}

/**
 * Metadata loaded from {@code to-be-deleted.json} is expected to yield exactly one file
 * endpoint, built from the context's endpoint, deposit id and the file id.
 */
@Test
public void extractFileIds() throws IOException {
    ZenodoContext ctx = getContext();
    InputStream metadataStream = getClass().getResourceAsStream("darktaxon-delete/to-be-deleted.json");
    ctx.setMetadata(new ObjectMapper().readTree(metadataStream));

    List<IRI> fileEndpoints = ZenodoUtils.getFileEndpoints(ctx);

    assertThat(fileEndpoints.size(), is(1));
    String expectedEndpoint = "https://sandbox.example.org/api/deposit/depositions/123/files/ba1a0c80-d8a0-4afc-997b-4059b1c03ae5";
    assertThat(fileEndpoints.get(0).getIRIString(), is(expectedEndpoint));
}

/**
 * Creates the context shared by the tests: endpoint {@code https://sandbox.example.org}
 * and deposit id {@code 123}.
 *
 * @return a fresh context without metadata set by this method
 */
private ZenodoContext getContext() {
    // NOTE(review): the first constructor argument is presumably the access token — confirm
    ZenodoContext testContext = new ZenodoContext("bla", "https://sandbox.example.org");
    testContext.setDepositId(123L);
    return testContext;
}

/**
 * Metadata loaded from {@code to-be-deleted-no-files.json} is expected to yield no file
 * endpoints.
 */
@Test
public void extractFileIdsNoMatching() throws IOException {
    InputStream metadataStream = getClass().getResourceAsStream("darktaxon-delete/to-be-deleted-no-files.json");
    JsonNode metadata = new ObjectMapper().readTree(metadataStream);

    ZenodoContext context = getContext();
    context.setMetadata(metadata);

    assertThat(ZenodoUtils.getFileEndpoints(context).size(), is(0));
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<https://preston.guoda.bio> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/ns/prov#SoftwareAgent> <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<https://preston.guoda.bio> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/ns/prov#Agent> <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<https://preston.guoda.bio> <http://purl.org/dc/terms/description> "Preston is a software program that finds, archives and provides access to biodiversity datasets."@en <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/ns/prov#Activity> <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> <http://purl.org/dc/terms/description> "A crawl event that discovers biodiversity archives."@en <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> <http://www.w3.org/ns/prov#startedAtTime> "2024-09-09T16:55:37.776Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> <http://www.w3.org/ns/prov#wasStartedBy> <https://preston.guoda.bio> <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<https://doi.org/10.5281/zenodo.1410543> <http://www.w3.org/ns/prov#usedBy> <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<https://doi.org/10.5281/zenodo.1410543> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/dc/dcmitype/Software> <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<https://doi.org/10.5281/zenodo.1410543> <http://purl.org/dc/terms/bibliographicCitation> "Jorrit Poelen, Icaro Alzuru, & Michael Elliott. 2018-2024. Preston: a biodiversity dataset tracker (Version 0.9.7-SNAPSHOT) [Software]. Zenodo. https://doi.org/10.5281/zenodo.1410543"@en <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<urn:uuid:0659a54f-b713-4f86-a917-5be166a14110> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/ns/prov#Entity> <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<urn:uuid:0659a54f-b713-4f86-a917-5be166a14110> <http://purl.org/dc/terms/description> "A biodiversity dataset graph archive."@en <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<hash://md5/29ccb90cb281069a23b4e07ec10583c8> <http://www.w3.org/ns/prov#usedBy> <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> .
<hash://md5/c157a79031e1c40f85931829bc5fc552> <http://www.w3.org/ns/prov#wasGeneratedBy> <urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> <urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> .
<hash://md5/c157a79031e1c40f85931829bc5fc552> <http://www.w3.org/ns/prov#qualifiedGeneration> <urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> <urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> .
<urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> <http://www.w3.org/ns/prov#generatedAtTime> "2024-09-09T16:55:37.843Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> <urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> .
<urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/ns/prov#Generation> <urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> .
<urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> <http://www.w3.org/ns/prov#wasInformedBy> <urn:uuid:694e11f8-fbb5-449e-bedf-d9f40688d1c8> <urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> .
<urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> <http://www.w3.org/ns/prov#used> <urn:uuid:89d01646-f2c1-4cf7-aa0e-c3b154de6b74> <urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> .
<urn:uuid:89d01646-f2c1-4cf7-aa0e-c3b154de6b74> <http://purl.org/pav/hasVersion> <hash://md5/c157a79031e1c40f85931829bc5fc552> <urn:uuid:81682088-29a9-4d81-9704-0080b4ddf0a8> .
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
hash://md5/c8e5dd65f2bb814f6d5b3eab58281b55
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<https://preston.guoda.bio> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/ns/prov#SoftwareAgent> <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<https://preston.guoda.bio> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/ns/prov#Agent> <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<https://preston.guoda.bio> <http://purl.org/dc/terms/description> "Preston is a software program that finds, archives and provides access to biodiversity datasets."@en <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/ns/prov#Activity> <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> <http://purl.org/dc/terms/description> "A crawl event that discovers biodiversity archives."@en <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> <http://www.w3.org/ns/prov#startedAtTime> "2024-09-09T16:55:31.567Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> <http://www.w3.org/ns/prov#wasStartedBy> <https://preston.guoda.bio> <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<https://doi.org/10.5281/zenodo.1410543> <http://www.w3.org/ns/prov#usedBy> <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<https://doi.org/10.5281/zenodo.1410543> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/dc/dcmitype/Software> <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<https://doi.org/10.5281/zenodo.1410543> <http://purl.org/dc/terms/bibliographicCitation> "Jorrit Poelen, Icaro Alzuru, & Michael Elliott. 2018-2024. Preston: a biodiversity dataset tracker (Version 0.9.7-SNAPSHOT) [Software]. Zenodo. https://doi.org/10.5281/zenodo.1410543"@en <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<urn:uuid:0659a54f-b713-4f86-a917-5be166a14110> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/ns/prov#Entity> <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<urn:uuid:0659a54f-b713-4f86-a917-5be166a14110> <http://purl.org/dc/terms/description> "A biodiversity dataset graph archive."@en <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<hash://md5/c8e5dd65f2bb814f6d5b3eab58281b55> <http://www.w3.org/ns/prov#usedBy> <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> .
<hash://md5/8465337f1fc30cf8078caf1dd69f55ca> <http://www.w3.org/ns/prov#wasGeneratedBy> <urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> <urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> .
<hash://md5/8465337f1fc30cf8078caf1dd69f55ca> <http://www.w3.org/ns/prov#qualifiedGeneration> <urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> <urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> .
<urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> <http://www.w3.org/ns/prov#generatedAtTime> "2024-09-09T16:55:31.635Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> <urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> .
<urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/ns/prov#Generation> <urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> .
<urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> <http://www.w3.org/ns/prov#wasInformedBy> <urn:uuid:e125f1df-603c-4ed5-aa13-2acd0044903c> <urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> .
<urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> <http://www.w3.org/ns/prov#used> <urn:uuid:8e3a9d0b-4b99-4e03-b961-b0359754da8f> <urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> .
<urn:uuid:8e3a9d0b-4b99-4e03-b961-b0359754da8f> <http://purl.org/pav/hasVersion> <hash://md5/8465337f1fc30cf8078caf1dd69f55ca> <urn:uuid:e88efae0-4883-4727-9b1d-7f464d22ccdd> .
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
hash://md5/974c37be498a377c3795bcbdb6e20581
Loading

0 comments on commit cc33b09

Please sign in to comment.