diff --git a/doc/release-notes/9130-cleanup-storage.md b/doc/release-notes/9130-cleanup-storage.md
new file mode 100644
index 00000000000..052123d5dd5
--- /dev/null
+++ b/doc/release-notes/9130-cleanup-storage.md
@@ -0,0 +1,3 @@
+### Support for cleaning up files in datasets' storage
+
+Experimental feature: the leftover files stored in the Dataset storage location that are not in the file list of that Dataset, but are named following the Dataverse technical convention for dataset files, can be removed with the new native API call [Cleanup storage of a Dataset](https://guides.dataverse.org/en/latest/api/native-api.html#cleanup-storage-api).
\ No newline at end of file
diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst
index f662eaa7a61..f67c5a45174
--- a/doc/sphinx-guides/source/api/native-api.rst
+++ b/doc/sphinx-guides/source/api/native-api.rst
@@ -1513,6 +1513,38 @@ The fully expanded example above (without environment variables) looks like this
   curl -H X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx -X POST https://demo.dataverse.org/api/datasets/:persistentId/add?persistentId=doi:10.5072/FK2/J8SJZB -F 'jsonData={"description":"A remote image.","storageIdentifier":"trsa://themes/custom/qdr/images/CoreTrustSeal-logo-transparent.png","checksumType":"MD5","md5Hash":"509ef88afa907eaf2c17c1c8d8fde77e","label":"testlogo.png","fileName":"testlogo.png","mimeType":"image/png"}'
 
+.. _cleanup-storage-api:
+
+Cleanup storage of a Dataset
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is an experimental feature and should be tested on your system before using it in production.
+Also, make sure that your backups are up-to-date before using this on production servers.
+It is advised to first call this method with the ``dryrun`` parameter set to ``true`` before actually deleting the files.
+This will allow you to manually inspect the files that would be deleted if that parameter were set to ``false`` or omitted (the response includes a list of the files that would be deleted).
+
+If your Dataverse installation has been configured to support direct uploads, or in some other situations,
+you could end up with files in a dataset's storage location that are not linked to that dataset.
+Most commonly, this happens when an upload fails in the middle of a transfer: for example, if a user starts a direct upload in the UI and leaves the page without hitting Cancel or Save,
+Dataverse is not notified and does not clean up the files. Similarly, with the direct upload API, if the final /addFiles call is never made, the uploaded files are abandoned.
+
+All the files stored in the Dataset storage location that are not in the file list of that Dataset (and follow the naming pattern of the dataset files) can be removed, as shown in the example below.
+
+.. code-block:: bash
+
+  export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+  export SERVER_URL=https://demo.dataverse.org
+  export PERSISTENT_ID=doi:10.5072/FK2/J8SJZB
+  export DRYRUN=true
+
+  curl -H "X-Dataverse-key: $API_TOKEN" -X GET "$SERVER_URL/api/datasets/:persistentId/cleanStorage?persistentId=$PERSISTENT_ID&dryrun=$DRYRUN"
+
+The fully expanded example above (without environment variables) looks like this:
+
+.. code-block:: bash
+
+  curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X GET "https://demo.dataverse.org/api/datasets/:persistentId/cleanStorage?persistentId=doi:10.5072/FK2/J8SJZB&dryrun=true"
+
 Adding Files To a Dataset via Other Tools
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java
index c3d262a20db..0bb6eebb80b 100644
--- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java
+++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java
@@ -67,6 +67,7 @@ import edu.harvard.iq.dataverse.dataaccess.DataAccess;
 import edu.harvard.iq.dataverse.dataaccess.ImageThumbConverter;
 import edu.harvard.iq.dataverse.dataaccess.S3AccessIO;
+import edu.harvard.iq.dataverse.dataaccess.StorageIO;
 import edu.harvard.iq.dataverse.engine.command.exception.CommandException;
 import edu.harvard.iq.dataverse.engine.command.exception.UnforcedCommandException;
 import edu.harvard.iq.dataverse.engine.command.impl.GetDatasetStorageSizeCommand;
@@ -114,11 +115,13 @@ import java.time.LocalDateTime;
 import java.util.*;
 import java.util.concurrent.*;
+import java.util.function.Predicate;
 import java.time.ZoneId;
 import java.time.format.DateTimeFormatter;
 import java.util.Map.Entry;
 import java.util.logging.Level;
 import java.util.logging.Logger;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 import javax.ejb.EJB;
@@ -155,6 +158,7 @@ public class Datasets extends AbstractApiBean {
 
     private static final Logger logger = Logger.getLogger(Datasets.class.getCanonicalName());
+    private static final Pattern dataFilePattern = Pattern.compile("^[0-9a-f]{11}-[0-9a-f]{12}\\.?.*");
 
     @Inject
     DataverseSession session;
@@ -2502,6 +2506,76 @@ public Response addFileToDataset(@PathParam("id") String idSupplied,
 
     } // end: addFileToDataset
 
+    /**
+     * Clean storage of a Dataset
+     *
+     * @param idSupplied
+     * @return
+     */
+    @GET
+    @Path("{id}/cleanStorage")
+    public Response cleanStorage(@PathParam("id") String idSupplied, @QueryParam("dryrun") Boolean dryrun) {
+        // get user and dataset
+        User authUser;
+        try {
+            authUser = findUserOrDie();
+        } catch (WrappedResponse ex) {
+            return error(Response.Status.FORBIDDEN,
+                    BundleUtil.getStringFromBundle("file.addreplace.error.auth")
+            );
+        }
+
+        Dataset dataset;
+        try {
+            dataset = findDatasetOrDie(idSupplied);
+        } catch (WrappedResponse wr) {
+            return wr.getResponse();
+        }
+
+        // check permissions
+        if (!permissionSvc.permissionsFor(createDataverseRequest(authUser), dataset).contains(Permission.EditDataset)) {
+            return error(Response.Status.INTERNAL_SERVER_ERROR, "Access denied!");
+        }
+
+        boolean doDryRun = dryrun != null && dryrun.booleanValue();
+
+        // check if no legacy files are present
+        Set<String> datasetFilenames = getDatasetFilenames(dataset);
+        if (datasetFilenames.stream().anyMatch(x -> !dataFilePattern.matcher(x).matches())) {
+            logger.log(Level.WARNING, "Dataset contains legacy files not matching the naming pattern!");
+        }
+
+        Predicate<String> filter = getToDeleteFilesFilter(datasetFilenames);
+        List<String> deleted;
+        try {
+            StorageIO<Dataset> datasetIO = DataAccess.getStorageIO(dataset);
+            deleted = datasetIO.cleanUp(filter, doDryRun);
+        } catch (IOException ex) {
+            logger.log(Level.SEVERE, null, ex);
+            return error(Response.Status.INTERNAL_SERVER_ERROR, "IOException! Serious Error! See administrator!");
+        }
+
+        return ok("Found: " + datasetFilenames.stream().collect(Collectors.joining(", ")) + "\n" + "Deleted: " + deleted.stream().collect(Collectors.joining(", ")));
+    }
+
+    private static Set<String> getDatasetFilenames(Dataset dataset) {
+        Set<String> files = new HashSet<>();
+        for (DataFile dataFile: dataset.getFiles()) {
+            String storageIdentifier = dataFile.getStorageIdentifier();
+            String location = storageIdentifier.substring(storageIdentifier.indexOf("://") + 3);
+            String[] locationParts = location.split(":");//separate bucket, swift container, etc. from fileName
+            files.add(locationParts[locationParts.length-1]);
+        }
+        return files;
+    }
+
+    public static Predicate<String> getToDeleteFilesFilter(Set<String> datasetFilenames) {
+        return f -> {
+            return dataFilePattern.matcher(f).matches() && datasetFilenames.stream().noneMatch(x -> f.startsWith(x));
+        };
+    }
+
     private void msg(String m) {
         //System.out.println(m);
         logger.fine(m);
diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java
index d5f00b9868f..8ee3f0cf53c 100644
--- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java
+++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/FileAccessIO.java
@@ -33,9 +33,11 @@ import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.List;
+import java.util.function.Predicate;
 import java.util.logging.Logger;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
 
 // Dataverse imports:
 import edu.harvard.iq.dataverse.DataFile;
@@ -683,4 +685,56 @@ protected static boolean isValidIdentifier(String driverId, String storageId) {
         }
         return true;
     }
+
+    private List<String> listAllFiles() throws IOException {
+        Dataset dataset = this.getDataset();
+        if (dataset == null) {
+            throw new IOException("This FileAccessIO object hasn't been properly initialized.");
+        }
+
+        Path datasetDirectoryPath = Paths.get(dataset.getAuthorityForFileStorage(), dataset.getIdentifierForFileStorage());
+        if (datasetDirectoryPath == null) {
+            throw new IOException("Could not determine the filesystem directory of the dataset.");
+        }
+
+        DirectoryStream<Path> dirStream = Files.newDirectoryStream(Paths.get(this.getFilesRootDirectory(), datasetDirectoryPath.toString()));
+
+        List<String> res = new ArrayList<>();
+        if (dirStream != null) {
+            for (Path filePath : dirStream) {
+                res.add(filePath.getFileName().toString());
+            }
+            dirStream.close();
+        }
+
+        return res;
+    }
+
+    private void deleteFile(String fileName) throws IOException {
+        Dataset dataset = this.getDataset();
+        if (dataset == null) {
+            throw new IOException("This FileAccessIO object hasn't been properly initialized.");
+        }
+
+        Path datasetDirectoryPath = Paths.get(dataset.getAuthorityForFileStorage(), dataset.getIdentifierForFileStorage());
+        if (datasetDirectoryPath == null) {
+            throw new IOException("Could not determine the filesystem directory of the dataset.");
+        }
+
+        Path p = Paths.get(this.getFilesRootDirectory(), datasetDirectoryPath.toString(), fileName);
+        Files.delete(p);
+    }
+
+    @Override
+    public List<String> cleanUp(Predicate<String> filter, boolean dryRun) throws IOException {
+        List<String> toDelete = this.listAllFiles().stream().filter(filter).collect(Collectors.toList());
+        if (dryRun) {
+            return toDelete;
+        }
+        for (String f : toDelete) {
+            this.deleteFile(f);
+        }
+        return toDelete;
+    }
+
 }
diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/InputStreamIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/InputStreamIO.java
index c9796d24b27..be6f9df0254 100644
--- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/InputStreamIO.java
+++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/InputStreamIO.java
@@ -14,6 +14,7 @@ import java.nio.channels.WritableByteChannel;
 import java.nio.file.Path;
 import java.util.List;
+import java.util.function.Predicate;
 import java.util.logging.Logger;
 
 /**
@@ -159,5 +160,9 @@ public void revertBackupAsAux(String auxItemTag) throws IOException {
         throw new UnsupportedDataAccessOperationException("InputStreamIO: this method is not supported in this DataAccess driver.");
     }
-    
+    @Override
+    public List<String> cleanUp(Predicate<String> filter, boolean dryRun) throws IOException {
+        throw new UnsupportedDataAccessOperationException("InputStreamIO: this method is not supported in this DataAccess driver.");
+    }
+
 }
diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/RemoteOverlayAccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/RemoteOverlayAccessIO.java
index c8e42349318..66c6a4cc2ee 100644
--- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/RemoteOverlayAccessIO.java
+++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/RemoteOverlayAccessIO.java
@@ -24,6 +24,7 @@ import java.security.KeyStoreException;
 import java.security.NoSuchAlgorithmException;
 import java.util.List;
+import java.util.function.Predicate;
 import java.util.logging.Logger;
 
 import org.apache.http.Header;
@@ -630,5 +631,9 @@ protected static boolean isValidIdentifier(String driverId, String storageId) {
     public static String getBaseStoreIdFor(String driverId) {
         return System.getProperty("dataverse.files." + driverId + ".base-store");
     }
-    
+
+    @Override
+    public List<String> cleanUp(Predicate<String> filter, boolean dryRun) throws IOException {
+        return baseStore.cleanUp(filter, dryRun);
+    }
 }
diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java
index 3c9cef04980..f396b07d788 100644
--- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java
+++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/S3AccessIO.java
@@ -60,7 +60,10 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Random;
+import java.util.function.Predicate;
 import java.util.logging.Logger;
+import java.util.stream.Collectors;
+
 import org.apache.commons.io.IOUtils;
 import org.eclipse.microprofile.config.Config;
 import org.eclipse.microprofile.config.ConfigProvider;
@@ -1306,5 +1309,75 @@ protected static boolean isValidIdentifier(String driverId, String storageId) {
         return true;
     }
 
+    private List<String> listAllFiles() throws IOException {
+        if (!this.canWrite()) {
+            open();
+        }
+        Dataset dataset = this.getDataset();
+        if (dataset == null) {
+            throw new IOException("This S3AccessIO object hasn't been properly initialized.");
+        }
+        String prefix = dataset.getAuthorityForFileStorage() + "/" + dataset.getIdentifierForFileStorage() + "/";
+
+        List<String> ret = new ArrayList<>();
+        ListObjectsRequest req = new ListObjectsRequest().withBucketName(bucketName).withPrefix(prefix);
+        ObjectListing storedFilesList = null;
+        try {
+            storedFilesList = s3.listObjects(req);
+        } catch (SdkClientException sce) {
+            throw new IOException ("S3 listObjects: failed to get a listing for " + prefix);
+        }
+        if (storedFilesList == null) {
+            return ret;
+        }
+        List<S3ObjectSummary> storedFilesSummary = storedFilesList.getObjectSummaries();
+        try {
+            while (storedFilesList.isTruncated()) {
+                logger.fine("S3 listObjects: going to next page of list");
+                storedFilesList = s3.listNextBatchOfObjects(storedFilesList);
+                if (storedFilesList != null) {
+                    storedFilesSummary.addAll(storedFilesList.getObjectSummaries());
+                }
+            }
+        } catch (AmazonClientException ase) {
+            //logger.warning("Caught an AmazonServiceException in S3AccessIO.listObjects(): " + ase.getMessage());
+            throw new IOException("S3AccessIO: Failed to get objects for listing.");
+        }
-}
+        for (S3ObjectSummary item : storedFilesSummary) {
+            String fileName = item.getKey().substring(prefix.length());
+            ret.add(fileName);
+        }
+        return ret;
+    }
+
+    private void deleteFile(String fileName) throws IOException {
+        if (!this.canWrite()) {
+            open();
+        }
+        Dataset dataset = this.getDataset();
+        if (dataset == null) {
+            throw new IOException("This S3AccessIO object hasn't been properly initialized.");
+        }
+        String prefix = dataset.getAuthorityForFileStorage() + "/" + dataset.getIdentifierForFileStorage() + "/";
+
+        try {
+            DeleteObjectRequest dor = new DeleteObjectRequest(bucketName, prefix + fileName);
+            s3.deleteObject(dor);
+        } catch (AmazonClientException ase) {
+            logger.warning("S3AccessIO: Unable to delete object " + ase.getMessage());
+        }
+    }
+
+    @Override
+    public List<String> cleanUp(Predicate<String> filter, boolean dryRun) throws IOException {
+        List<String> toDelete = this.listAllFiles().stream().filter(filter).collect(Collectors.toList());
+        if (dryRun) {
+            return toDelete;
+        }
+        for (String f : toDelete) {
+            this.deleteFile(f);
+        }
+        return toDelete;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java
index 90e4a54dbe8..bfd5c5f0d8f 100644
--- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java
+++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/StorageIO.java
@@ -39,6 +39,7 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.function.Predicate;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -622,4 +623,6 @@ protected static boolean usesStandardNamePattern(String identifier) {
         return m.find();
     }
 
+    public abstract List<String> cleanUp(Predicate<String> filter, boolean dryRun) throws IOException;
+
 }
diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/SwiftAccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/SwiftAccessIO.java
index b1725b040a3..6c84009de3e 100644
--- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/SwiftAccessIO.java
+++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/SwiftAccessIO.java
@@ -22,7 +22,10 @@ import java.util.Formatter;
 import java.util.List;
 import java.util.Properties;
+import java.util.function.Predicate;
 import java.util.logging.Logger;
+import java.util.stream.Collectors;
+
 import javax.crypto.Mac;
 import javax.crypto.spec.SecretKeySpec;
 import org.javaswift.joss.client.factory.AccountFactory;
@@ -864,13 +867,16 @@ public InputStream getAuxFileAsInputStream(String auxItemTag) throws IOException
         }
     }
 
+    private String getSwiftContainerName(Dataset dataset) {
+        String authorityNoSlashes = dataset.getAuthorityForFileStorage().replace("/", swiftFolderPathSeparator);
+        return dataset.getProtocolForFileStorage() + swiftFolderPathSeparator + authorityNoSlashes.replace(".", swiftFolderPathSeparator) +
+                swiftFolderPathSeparator + dataset.getIdentifierForFileStorage();
+    }
+
     @Override
     public String getSwiftContainerName() {
         if (dvObject instanceof DataFile) {
-            String authorityNoSlashes = this.getDataFile().getOwner().getAuthorityForFileStorage().replace("/", swiftFolderPathSeparator);
-            return this.getDataFile().getOwner().getProtocolForFileStorage() + swiftFolderPathSeparator
-                    + authorityNoSlashes.replace(".", swiftFolderPathSeparator) +
-                    swiftFolderPathSeparator + this.getDataFile().getOwner().getIdentifierForFileStorage();
+            return getSwiftContainerName(this.getDataFile().getOwner());
         }
         return null;
     }
@@ -893,5 +899,59 @@ public static String calculateRFC2104HMAC(String data, String key)
         mac.init(signingKey);
         return toHexString(mac.doFinal(data.getBytes()));
     }
-    
+
+    private List<String> listAllFiles() throws IOException {
+        if (!this.canWrite()) {
+            open(DataAccessOption.WRITE_ACCESS);
+        }
+        Dataset dataset = this.getDataset();
+        if (dataset == null) {
+            throw new IOException("This SwiftAccessIO object hasn't been properly initialized.");
+        }
+        String prefix = getSwiftContainerName(dataset) + swiftFolderPathSeparator;
+
+        Collection<StoredObject> items;
+        String lastItemName = null;
+        List<String> ret = new ArrayList<>();
+
+        while ((items = this.swiftContainer.list(prefix, lastItemName, LIST_PAGE_LIMIT)) != null && items.size() > 0) {
+            for (StoredObject item : items) {
+                lastItemName = item.getName().substring(prefix.length());
+                ret.add(lastItemName);
+            }
+        }
+
+        return ret;
+    }
+
+    private void deleteFile(String fileName) throws IOException {
+        if (!this.canWrite()) {
+            open(DataAccessOption.WRITE_ACCESS);
+        }
+        Dataset dataset = this.getDataset();
+        if (dataset == null) {
+            throw new IOException("This SwiftAccessIO object hasn't been properly initialized.");
+        }
+        String prefix = getSwiftContainerName(dataset) + swiftFolderPathSeparator;
+
+        StoredObject fileObject = this.swiftContainer.getObject(prefix + fileName);
+
+        if (!fileObject.exists()) {
+            throw new FileNotFoundException("SwiftAccessIO/Direct Access: " + fileName + " does not exist");
+        }
+
+        fileObject.delete();
+    }
+
+    @Override
+    public List<String> cleanUp(Predicate<String> filter, boolean dryRun) throws IOException {
+        List<String> toDelete = this.listAllFiles().stream().filter(filter).collect(Collectors.toList());
+        if (dryRun) {
+            return toDelete;
+        }
+        for (String f : toDelete) {
+            this.deleteFile(f);
+        }
+        return toDelete;
+    }
 }
diff --git a/src/test/java/edu/harvard/iq/dataverse/api/DatasetsTest.java b/src/test/java/edu/harvard/iq/dataverse/api/DatasetsTest.java
new file mode 100644
index 00000000000..fded590d9db
--- /dev/null
+++ b/src/test/java/edu/harvard/iq/dataverse/api/DatasetsTest.java
@@ -0,0 +1,58 @@
+package edu.harvard.iq.dataverse.api;
+
+import org.junit.Test;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class DatasetsTest {
+
+    /**
+     * Test cleanup filter
+     */
+    @Test
+    public void testCleanup() {
+        Set<String> datasetFiles = new HashSet<>() {
+            {
+                add("1837fda0b6c-90779481d439");
+                add("1837fda0e17-4b0926f6d44e");
+                add("1837fda1b80-46a899909269");
+            }
+        };
+        Set<String> filesOnDrive = new HashSet<>() {
+            {
+                add("1837fda0b6c-90779481d439");
+                add("1837fda0e17-4b0926f6d44e");
+                add("1837fda1b80-46a899909269");
+                add("prefix_1837fda0b6c-90779481d439");
+                add("1837fda0e17-4b0926f6d44e_suffix");
+                add("1837fda1b80-extra-46a899909269");
+                add("1837fda0e17-4b0926f6d44e.aux");
+                add("1837fda1994-5f74d57e6e47");
+                add("1837fda17ce-d7b9987fc6e9");
+                add("18383198c49-aeda08ccffff");
+                add("prefix_1837fda1994-5f74d57e6e47");
+                add("1837fda17ce-d7b9987fc6e9_suffix");
+                add("18383198c49-extra-aeda08ccffff");
+                add("some_other_file");
+                add("1837fda17ce-d7b9987fc6e9.aux");
+                add("18383198c49.aeda08ccffff");
+                add("1837fda17ce-d7b9987fc6xy");
+            }
+        };
+
+        Predicate<String> toDeleteFilesFilter = Datasets.getToDeleteFilesFilter(datasetFiles);
+        Set<String> deleted = filesOnDrive.stream().filter(toDeleteFilesFilter).collect(Collectors.toSet());
+
+        assertEquals(5, deleted.size());
+        assertTrue(deleted.contains("1837fda1994-5f74d57e6e47"));
+        assertTrue(deleted.contains("1837fda17ce-d7b9987fc6e9"));
+        assertTrue(deleted.contains("18383198c49-aeda08ccffff"));
+        assertTrue(deleted.contains("1837fda17ce-d7b9987fc6e9_suffix"));
+        assertTrue(deleted.contains("1837fda17ce-d7b9987fc6e9.aux"));
+    }
+}
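For reference, a dry-run call and its response could look roughly like the following, based on the cleanStorage implementation and the test fixture above. This is a sketch, not output from a real installation: the persistent ID is the documentation example, the file names are the hypothetical ones from the unit test, and the JSON envelope is assumed to be the standard ok() wrapper. With dryrun=true, the files listed after "Deleted:" are only candidates and are not actually removed.

  curl -H "X-Dataverse-key: $API_TOKEN" -X GET \
    "https://demo.dataverse.org/api/datasets/:persistentId/cleanStorage?persistentId=doi:10.5072/FK2/J8SJZB&dryrun=true"

  {
    "status": "OK",
    "data": {
      "message": "Found: 1837fda0b6c-90779481d439, 1837fda0e17-4b0926f6d44e, 1837fda1b80-46a899909269\nDeleted: 1837fda1994-5f74d57e6e47, 1837fda17ce-d7b9987fc6e9, 18383198c49-aeda08ccffff"
    }
  }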