diff --git a/README.md b/README.md index 490c3f20..632442f5 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ new OCFL repositories, but is not required when opening an existing repository. OCFL extension. Currently, the following extensions are implemented: * [0002-flat-direct-storage-layout](https://ocfl.github.io/extensions/0002-flat-direct-storage-layout.html): `FlatLayoutConfig` * [0003-hash-and-id-n-tuple-storage-layout](https://ocfl.github.io/extensions/0003-hash-and-id-n-tuple-storage-layout.html): `HashedNTupleIdEncapsulationLayoutConfig` - * [0004-hashed-n-tuple-storage-layout](https://github.com/OCFL/extensions/pull/16): `HashedNTupleLayoutConfig` + * [0004-hashed-n-tuple-storage-layout](https://ocfl.github.io/extensions/0004-hashed-n-tuple-storage-layout.html): `HashedNTupleLayoutConfig` ### Optional Properties @@ -284,7 +284,7 @@ The following is a list of currently supported storage layout extensions: * [0003-hash-and-id-n-tuple-storage-layout](https://ocfl.github.io/extensions/0003-hash-and-id-n-tuple-storage-layout.html) * Configuration class: `HashedNTupleIdEncapsulationLayoutConfig` * Implementation class: `HashedNTupleIdEncapsulationLayoutExtension` -* [0004-hashed-n-tuple-storage-layout](https://github.com/OCFL/extensions/pull/16) +* [0004-hashed-n-tuple-storage-layout](https://ocfl.github.io/extensions/0004-hashed-n-tuple-storage-layout.html) * Configuration class: `HashedNTupleLayoutConfig` * Implementation class: `HashedNTupleLayoutExtension` diff --git a/ocfl-java-api/src/main/java/edu/wisc/library/ocfl/api/OcflConstants.java b/ocfl-java-api/src/main/java/edu/wisc/library/ocfl/api/OcflConstants.java index fbcc5145..4efe9c3d 100644 --- a/ocfl-java-api/src/main/java/edu/wisc/library/ocfl/api/OcflConstants.java +++ b/ocfl-java-api/src/main/java/edu/wisc/library/ocfl/api/OcflConstants.java @@ -52,7 +52,7 @@ private OcflConstants() { public static final DigestAlgorithm DEFAULT_DIGEST_ALGORITHM = DigestAlgorithm.sha512; public static final 
Set ALLOWED_DIGEST_ALGORITHMS = Set.of(DigestAlgorithm.sha512, DigestAlgorithm.sha256); - public static final String MUTABLE_HEAD_EXT_PATH = EXTENSIONS_DIR + "/0004-mutable-head"; + public static final String MUTABLE_HEAD_EXT_PATH = EXTENSIONS_DIR + "/0005-mutable-head"; public static final String MUTABLE_HEAD_VERSION_PATH = MUTABLE_HEAD_EXT_PATH + "/head"; public static final String MUTABLE_HEAD_REVISIONS_PATH = MUTABLE_HEAD_EXT_PATH + "/revisions"; diff --git a/ocfl-java-aws/src/main/java/edu/wisc/library/ocfl/aws/OcflS3Client.java b/ocfl-java-aws/src/main/java/edu/wisc/library/ocfl/aws/OcflS3Client.java index 1cb0752a..c504e9d5 100644 --- a/ocfl-java-aws/src/main/java/edu/wisc/library/ocfl/aws/OcflS3Client.java +++ b/ocfl-java-aws/src/main/java/edu/wisc/library/ocfl/aws/OcflS3Client.java @@ -72,6 +72,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.List; +import java.util.Objects; import java.util.function.BiConsumer; import java.util.stream.Collectors; @@ -489,6 +490,7 @@ public void deletePath(String path) { public void deleteObjects(Collection objectPaths) { if (!objectPaths.isEmpty()) { var objectKeys = objectPaths.stream() + .filter(Objects::nonNull) .map(keyBuilder::buildFromPath) .collect(Collectors.toList()); diff --git a/ocfl-java-aws/src/test/java/edu/wisc/library/ocfl/aws/OcflS3Test.java b/ocfl-java-aws/src/test/java/edu/wisc/library/ocfl/aws/OcflS3Test.java index 9f51fafe..65bbf71b 100644 --- a/ocfl-java-aws/src/test/java/edu/wisc/library/ocfl/aws/OcflS3Test.java +++ b/ocfl-java-aws/src/test/java/edu/wisc/library/ocfl/aws/OcflS3Test.java @@ -67,12 +67,12 @@ public void basicMutableHeadTest(String repoPrefix) { "inventory.json.sha512", "v1/inventory.json", "v1/inventory.json.sha512", - "extensions/0004-mutable-head/root-inventory.json.sha512", - "extensions/0004-mutable-head/revisions/r1", - "extensions/0004-mutable-head/head/inventory.json", - "extensions/0004-mutable-head/head/inventory.json.sha512", - 
"extensions/0004-mutable-head/head/content/r1/dir/file1.txt", - "extensions/0004-mutable-head/head/content/r1/dir/sub/file2.txt" + "extensions/0005-mutable-head/root-inventory.json.sha512", + "extensions/0005-mutable-head/revisions/r1", + "extensions/0005-mutable-head/head/inventory.json", + "extensions/0005-mutable-head/head/inventory.json.sha512", + "extensions/0005-mutable-head/head/content/r1/dir/file1.txt", + "extensions/0005-mutable-head/head/content/r1/dir/sub/file2.txt" )); assertEquals("file1", streamToString(repo.getObject(ObjectVersionId.head(objectId)).getFile("dir/file1.txt").getStream())); diff --git a/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/extension/storage/layout/HashedNTupleLayoutExtension.java b/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/extension/storage/layout/HashedNTupleLayoutExtension.java index bd7dcdfd..7888ef9e 100644 --- a/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/extension/storage/layout/HashedNTupleLayoutExtension.java +++ b/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/extension/storage/layout/HashedNTupleLayoutExtension.java @@ -31,9 +31,8 @@ import edu.wisc.library.ocfl.core.util.DigestUtil; /** - * Implementation of the Hashed Truncated N-tuple Trees for OCFL Storage Hierarchies extension. - * - * TODO add link to spec when finalized + * Implementation of the + * Hashed N-tuple Storage Layout extension. 
*/ public class HashedNTupleLayoutExtension implements OcflStorageLayoutExtension { diff --git a/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/extension/storage/layout/config/FlatLayoutConfig.java b/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/extension/storage/layout/config/FlatLayoutConfig.java index 0d8d7895..e7e83579 100644 --- a/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/extension/storage/layout/config/FlatLayoutConfig.java +++ b/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/extension/storage/layout/config/FlatLayoutConfig.java @@ -31,9 +31,8 @@ import java.util.Objects; /** - * Configuration for the Flat Storage Layout extension. - * - * TODO Add link to spec when finalized + * Configuration for the + * Flat Direct Storage Layout extension. */ public class FlatLayoutConfig implements OcflExtensionConfig { diff --git a/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/extension/storage/layout/config/HashedNTupleLayoutConfig.java b/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/extension/storage/layout/config/HashedNTupleLayoutConfig.java index 77431411..9a85c1da 100644 --- a/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/extension/storage/layout/config/HashedNTupleLayoutConfig.java +++ b/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/extension/storage/layout/config/HashedNTupleLayoutConfig.java @@ -35,9 +35,8 @@ import java.util.Objects; /** - * Configuration for the Hashed Truncated N-tuple Trees for OCFL Storage Hierarchies extension. - * - * TODO Add link to spec when finalized + * Configuration for the + * Hashed N-tuple Storage Layout extension. 
*/ public class HashedNTupleLayoutConfig implements OcflExtensionConfig { diff --git a/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/storage/cloud/CloudOcflStorageInitializer.java b/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/storage/cloud/CloudOcflStorageInitializer.java index 1350d0d1..9a20e7c4 100644 --- a/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/storage/cloud/CloudOcflStorageInitializer.java +++ b/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/storage/cloud/CloudOcflStorageInitializer.java @@ -58,6 +58,7 @@ public class CloudOcflStorageInitializer { private static final Logger LOG = LoggerFactory.getLogger(CloudOcflStorageInitializer.class); + private static final String SPECS_DIR = "specs/"; private static final String MEDIA_TYPE_TEXT = "text/plain; charset=UTF-8"; private static final String MEDIA_TYPE_JSON = "application/json; charset=UTF-8"; private static final String OBJECT_MARKER_PREFIX = "0=ocfl_object"; @@ -209,6 +210,7 @@ private OcflStorageLayoutExtension initNewRepo(OcflVersion ocflVersion, OcflExte keys.add(writeNamasteFile(ocflVersion)); keys.add(writeOcflSpec(ocflVersion)); keys.addAll(writeOcflLayout(layoutConfig, layoutExtension.getDescription())); + keys.add(writeOcflLayoutSpec(layoutConfig)); return layoutExtension; } catch (RuntimeException e) { LOG.error("Failed to initialize OCFL repository", e); @@ -218,9 +220,21 @@ private OcflStorageLayoutExtension initNewRepo(OcflVersion ocflVersion, OcflExte } private String writeOcflSpec(OcflVersion ocflVersion) { - var ocflSpecFile = ocflVersion.getOcflVersion() + ".txt"; - try (var ocflSpecStream = this.getClass().getClassLoader().getResourceAsStream(ocflSpecFile)) { - return uploadStream(ocflSpecFile, ocflSpecStream).getPath(); + return writeSpecFile(ocflVersion.getOcflVersion() + ".txt"); + } + + private String writeOcflLayoutSpec(OcflExtensionConfig layoutConfig) { + try { + return writeSpecFile(layoutConfig.getExtensionName() + ".md"); + } catch 
(RuntimeException e) { + LOG.warn("Failed to write spec file for layout extension {}", layoutConfig.getExtensionName(), e); + return null; + } + } + + private String writeSpecFile(String fileName) { + try (var stream = this.getClass().getClassLoader().getResourceAsStream(SPECS_DIR + fileName)) { + return uploadStream(fileName, stream).getPath(); } catch (IOException e) { throw new OcflIOException(e); } diff --git a/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/storage/filesystem/FileSystemOcflStorageInitializer.java b/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/storage/filesystem/FileSystemOcflStorageInitializer.java index ed5d0fc9..cb01e2a0 100644 --- a/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/storage/filesystem/FileSystemOcflStorageInitializer.java +++ b/ocfl-java-core/src/main/java/edu/wisc/library/ocfl/core/storage/filesystem/FileSystemOcflStorageInitializer.java @@ -60,6 +60,7 @@ public class FileSystemOcflStorageInitializer { private static final Logger LOG = LoggerFactory.getLogger(FileSystemOcflStorageInitializer.class); + private static final String SPECS_DIR = "specs/"; private static final String OBJECT_MARKER_PREFIX = "0=ocfl_object"; private final ObjectMapper objectMapper; @@ -222,6 +223,7 @@ private OcflStorageLayoutExtension initNewRepo(Path repositoryRoot, OcflVersion new NamasteTypeFile(ocflVersion.getOcflVersion()).writeFile(repositoryRoot); writeOcflSpec(repositoryRoot, ocflVersion); writeOcflLayout(repositoryRoot, layoutConfig, layoutExtension.getDescription()); + writeOcflLayoutSpec(repositoryRoot, layoutConfig); return layoutExtension; } catch (RuntimeException e) { LOG.error("Failed to initialize OCFL repository at {}", repositoryRoot, e); @@ -232,8 +234,21 @@ private OcflStorageLayoutExtension initNewRepo(Path repositoryRoot, OcflVersion private void writeOcflSpec(Path repositoryRoot, OcflVersion ocflVersion) { var ocflSpecFile = ocflVersion.getOcflVersion() + ".txt"; - try (var ocflSpecStream = 
this.getClass().getClassLoader().getResourceAsStream(ocflSpecFile)) { - Files.copy(ocflSpecStream, repositoryRoot.resolve(ocflSpecFile)); + writeSpecFile(repositoryRoot, ocflSpecFile); + } + + private void writeOcflLayoutSpec(Path repositoryRoot, OcflExtensionConfig layoutConfig) { + var specFile = layoutConfig.getExtensionName() + ".md"; + try { + writeSpecFile(repositoryRoot, specFile); + } catch (RuntimeException e) { + LOG.warn("Failed to write spec file for layout extension {}", layoutConfig.getExtensionName(), e); + } + } + + private void writeSpecFile(Path repositoryRoot, String fileName) { + try (var stream = this.getClass().getClassLoader().getResourceAsStream(SPECS_DIR + fileName)) { + Files.copy(stream, repositoryRoot.resolve(fileName)); } catch (IOException e) { throw new OcflIOException(e); } diff --git a/ocfl-java-core/src/main/resources/specs/0002-flat-direct-storage-layout.md b/ocfl-java-core/src/main/resources/specs/0002-flat-direct-storage-layout.md new file mode 100644 index 00000000..5acd6a68 --- /dev/null +++ b/ocfl-java-core/src/main/resources/specs/0002-flat-direct-storage-layout.md @@ -0,0 +1,69 @@ +# OCFL Community Extension 0002: Flat Direct Storage Layout + +* **Extension Name:** 0002-flat-direct-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes a simple flat OCFL storage layout. OCFL object identifiers are mapped directly to directory names that are direct children of the OCFL storage root directory. + +The limitations of this layout are filesystem dependent, but are generally as follows: + +* The size of object IDs cannot exceed the maximum allowed directory name size (eg. 
255 characters) +* Object IDs cannot include characters that are illegal in directory names +* Performance may degrade as the size of a repository increases because every object is a direct child of the storage root + +## Parameters + +This extension has no parameters. + +## Procedure + +The OCFL object identifier is used, without any changes, as the object's root path within the OCFL storage root. + +## Examples + +### Example 1 + +This example demonstrates some mappings that produce directory names that are valid on unix filesystems. + +#### Mappings + +| Object ID | Object Root Path | +| --- | --- | +| object-01 | `object-01` | +| ..hor\_rib:lé-$id | `..hor_rib:lé-$id` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── object-01/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ..hor_rib:lé-$id/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates some mappings that produce directory names that are invalid on unix filesystems; therefore this layout cannot be used in a repository that needs to be able to store objects with IDs like these. 
+ +#### Mappings + +| Object ID | Object Root Path | +| --- | --- | +| info:fedora/object-01 | `info:fedora/object-01` | +| abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij | `abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij` | + diff --git a/ocfl-java-core/src/main/resources/specs/0003-hash-and-id-n-tuple-storage-layout.md b/ocfl-java-core/src/main/resources/specs/0003-hash-and-id-n-tuple-storage-layout.md new file mode 100644 index 00000000..85095f75 --- /dev/null +++ b/ocfl-java-core/src/main/resources/specs/0003-hash-and-id-n-tuple-storage-layout.md @@ -0,0 +1,380 @@ +# OCFL Community Extension 0003: Hashed Truncated N-tuple Trees with Object ID Encapsulating Directory for OCFL Storage Hierarchies + +* **Extension Name:** 0003-hash-and-id-n-tuple-storage-layout +* **Authors:** Ben Cail +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters, to OCFL object root directories. + +Using this extension, OCFL object identifiers are hashed and encoded as hex +strings (all letters lower-case). These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL storage +root. Finally, the OCFL object identifier is percent-encoded to create a +directory name for the OCFL object root (see ["Encapsulation +Directory"](#encapsulation-directory) section below). 
+ +The n-tuple segments approach allows OCFL object identifiers to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. The encoded encapsulation directory name provides visibility into +the object identifier from the file path (see ["Encapsulation +Directory"](#encapsulation-directory) section below for details). + +## Encapsulation Directory + +For basic OCFL object identifiers, the object identifier is used as the name of +the encapsulation directory (ie. the object root directory). + +Some object identifiers could contain characters that are not safe for directory +names on all filesystems. Safe characters are defined as A-Z, a-z, 0-9, '-' and +'\_'. When an unsafe character is encountered in an object identifier, it is +percent-encoded using the lower-case hex characters of its UTF-8 encoding. + +Some object identifiers could also result in an encoded string that is longer +than can be supported as a directory name. To handle that scenario, if the +percent-encoded object identifier is longer than 100 characters, it is truncated +to 100 characters, and then the digest of the original object identifier is +appended to the encoded object identifier like this: +`<first-100-chars-of-encoded-object-id>-<digest>`. Note: this means that it +is no longer possible to determine the full object identifier from the +encapsulation directory name - some characters have been removed, and even the +first 100 characters of the encoded object identifier cannot be fully, reliably +decoded, because the truncation may leave a partial encoding at the end of the +100 characters.
+ +| Object ID | Encapsulation Directory Name | +| --- | --- | +| object-01 | object-01 | +| ..Hor/rib:lè-$id | %2e%2eHor%2frib%3al%c3%a8-%24id | +| abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghija | abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij-5cc73e648fbcff136510e330871180922ddacf193b68fdeff855683a01464220 | + +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply to the OCFL object identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name**: `tupleSize` + * **Description:** Indicates the size of the segments (in characters) that the digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates how many segments are used for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in each tuple. +The tuples are used as directory names. The default value is `3`, which means +that each intermediate directory in the OCFL storage hierarchy could contain up +to 4096 directories. Increasing this value increases the maximum number of +sub-directories per directory. 
+ +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +## Procedure + +The following is an outline of the steps to follow to map an OCFL object +identifier to an OCFL object root path using this extension (also see the +["Python Code"](#python-code) section): + +1. The OCFL object identifier is encoded as UTF-8 and hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lower-case hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. The OCFL object identifier is percent-encoded to create the encapsulation + directory name (see ["Encapsulation Directory"](#encapsulation-directory) + section above for details). +6. The encapsulation directory name is joined to the end of the path. + +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +this extension's default configuration. 
+ +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0003-hash-and-id-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3 +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --- | --- | --- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/object-01` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/%2e%2ehor%2frib%3ale-%24id` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/0003-hash-and-id-n-tuple-storage-layout/config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── object-01/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── %2e%2ehor%2frib%3ale-%24id/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0003-hash-and-id-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15 +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --- | --- | --- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/object-01` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/%2e%2ehor%2frib%3ale-%24id` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/0003-hash-and-id-n-tuple-storage-layout/config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── %2e%2ehor%2frib%3ale-%24id/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── object-01/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0003-hash-and-id-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0 +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --- | --- | --- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `object-01` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `%2e%2ehor%2frib%3ale-%24id` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/0003-hash-and-id-n-tuple-storage-layout/config.json +├── object-01/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── %2e%2ehor%2frib%3ale-%24id/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +## Python Code + +Here is some python code that implements the algorithm: + +``` +import codecs +import hashlib +import os +import re + + +def _percent_encode(c): + c_bytes = c.encode('utf8') + s = '' + for b in c_bytes: + s += '%' + codecs.encode(bytes([b]), encoding='hex_codec').decode('utf8') + return s + + +def _get_encapsulation_directory(object_id, digest): + d = '' + for c in object_id: + if re.match(r'[A-Za-z0-9-_]{1}', c): + d += c + else: + d += _percent_encode(c) + if len(d) > 100: + return f'{d[:100]}-{digest}' + return d + + +def ocfl_path(object_id, algorithm='sha256', tuple_size=3, number_of_tuples=3): + object_id_utf8 = object_id.encode('utf8') + if algorithm == 'md5': + digest = hashlib.md5(object_id_utf8).hexdigest() + elif algorithm == 'sha256': + digest = hashlib.sha256(object_id_utf8).hexdigest() + elif algorithm == 'sha512': + digest = hashlib.sha512(object_id_utf8).hexdigest() + digest = digest.lower() + path = '' + for i in range(number_of_tuples): + part = digest[i*tuple_size:i*tuple_size+tuple_size] + path = os.path.join(path, part) + encapsulation_directory =
_get_encapsulation_directory(object_id, digest=digest) + path = os.path.join(path, encapsulation_directory) + return path + + +def _check_path(object_id, correct_path, algorithm='sha256', tuple_size=3, number_of_tuples=3): + p = ocfl_path(object_id, algorithm=algorithm, tuple_size=tuple_size, number_of_tuples=number_of_tuples) + assert p == correct_path, f'{p} != {correct_path}' + print(f' "{object_id}" {algorithm} => {p}') + + +def run_tests(): + print('running tests...') + assert _percent_encode('.') == '%2e' + assert _percent_encode('ç') == '%c3%a7' + _check_path(object_id='object-01', correct_path='3c0/ff4/240/object-01') + _check_path(object_id='object-01', correct_path='ff7/553/449/object-01', algorithm='md5') + _check_path(object_id='object-01', correct_path='ff755/34492/object-01', algorithm='md5', tuple_size=5, number_of_tuples=2) + _check_path(object_id='object-01', correct_path='object-01', algorithm='md5', tuple_size=0, number_of_tuples=0) + _check_path(object_id='object-01', correct_path='ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/object-01', algorithm='md5', tuple_size=2, number_of_tuples=15) + _check_path(object_id='..hor/rib:le-$id', correct_path='487/326/d8c/%2e%2ehor%2frib%3ale-%24id') + _check_path(object_id='..hor/rib:le-$id', correct_path='083/197/66f/%2e%2ehor%2frib%3ale-%24id', algorithm='md5') #08319766fb6c2935dd175b94267717e0 + _check_path(object_id='..Hor/rib:lè-$id', correct_path='373/529/21a/%2e%2eHor%2frib%3al%c3%a8-%24id') + long_object_id = 'abcdefghij' * 26 + long_object_id_digest = '55b432806f4e270da0cf23815ed338742179002153cd8d896f23b3e2d8a14359' + _check_path(object_id=long_object_id, correct_path=f'55b/432/806/abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij-{long_object_id_digest}') + long_object_id_101 = 'abcdefghij' * 10 + 'a' + long_object_id_101_digest = '5cc73e648fbcff136510e330871180922ddacf193b68fdeff855683a01464220' + _check_path(object_id=long_object_id_101, 
correct_path=f'5cc/73e/648/abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij-{long_object_id_101_digest}') + + +if __name__ == '__main__': + run_tests() +``` diff --git a/ocfl-java-core/src/main/resources/specs/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-core/src/main/resources/specs/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-core/src/main/resources/specs/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. 
+ +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. + +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name**: `tupleSize` + * **Description:** Indicates the segment size (in characters) to split the + digest into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory.
+ +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. 
If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. + +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgoirthm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-core/src/main/resources/ocfl_1.0.txt b/ocfl-java-core/src/main/resources/specs/ocfl_1.0.txt similarity index 100% rename from ocfl-java-core/src/main/resources/ocfl_1.0.txt rename to ocfl-java-core/src/main/resources/specs/ocfl_1.0.txt diff --git a/ocfl-java-core/src/test/java/edu/wisc/library/ocfl/core/storage/filesystem/FileSystemOcflStorageInitializerTest.java b/ocfl-java-core/src/test/java/edu/wisc/library/ocfl/core/storage/filesystem/FileSystemOcflStorageInitializerTest.java index 9c543111..c0d21ba3 100644 --- a/ocfl-java-core/src/test/java/edu/wisc/library/ocfl/core/storage/filesystem/FileSystemOcflStorageInitializerTest.java +++ b/ocfl-java-core/src/test/java/edu/wisc/library/ocfl/core/storage/filesystem/FileSystemOcflStorageInitializerTest.java @@ -145,6 +145,7 @@ private void assertRootHasFiles(Path root) { aFileNamed(equalTo("0=ocfl_1.0")), aFileNamed(equalTo("ocfl_1.0.txt")), aFileNamed(equalTo(OcflConstants.OCFL_LAYOUT)), + aFileNamed(equalTo(HashedNTupleLayoutExtension.EXTENSION_NAME + ".md")), aFileNamed(equalTo(OcflConstants.EXTENSIONS_DIR)))); assertThat(root.resolve(OcflConstants.EXTENSIONS_DIR) .resolve(HashedNTupleLayoutExtension.EXTENSION_NAME) diff --git a/ocfl-java-itest/src/test/java/edu/wisc/library/ocfl/itest/MutableHeadITest.java b/ocfl-java-itest/src/test/java/edu/wisc/library/ocfl/itest/MutableHeadITest.java index b2a2a177..a14b8008 100644 --- a/ocfl-java-itest/src/test/java/edu/wisc/library/ocfl/itest/MutableHeadITest.java +++ b/ocfl-java-itest/src/test/java/edu/wisc/library/ocfl/itest/MutableHeadITest.java @@ -266,7 +266,7 @@ public void shouldFailWhenRevisionMarkerAlreadyExists() { OcflAsserts.assertThrowsWithMessage(ObjectOutOfSyncException.class, "Changes are out of sync with the current object state", () -> { repo.stageChanges(ObjectVersionId.head(objectId), defaultVersionInfo.setMessage("stage 2"), updater -> { - writeFile(repoName, O1_PATH + "/extensions/0004-mutable-head/revisions/r2", 
TestHelper.inputStream("r2")); + writeFile(repoName, O1_PATH + "/extensions/0005-mutable-head/revisions/r2", TestHelper.inputStream("r2")); updater.writeFile(new ByteArrayInputStream("file5" .getBytes()), "file5") .renameFile("dir1/file3", "file3") .removeFile("dir1/file4"); @@ -416,7 +416,7 @@ public void rejectImportObjectWhenWithMutableHeadWhenInvalidMutableHead() throws repo1.exportObject(objectId, output); - Files.delete(output.resolve("extensions/0004-mutable-head/head/content/r1/dir1/file3")); + Files.delete(output.resolve("extensions/0005-mutable-head/head/content/r1/dir1/file3")); var repoName2 = "mutable-import"; var repo2 = defaultRepo(repoName2); diff --git a/ocfl-java-itest/src/test/java/edu/wisc/library/ocfl/itest/OcflITest.java b/ocfl-java-itest/src/test/java/edu/wisc/library/ocfl/itest/OcflITest.java index 3910a6d7..3b4fc0b5 100644 --- a/ocfl-java-itest/src/test/java/edu/wisc/library/ocfl/itest/OcflITest.java +++ b/ocfl-java-itest/src/test/java/edu/wisc/library/ocfl/itest/OcflITest.java @@ -1119,7 +1119,7 @@ public void purgeObjectWhenExists() { repo.describeObject(objectId); }); - assertEquals(4, listFilesInRepo(repoName).size()); + assertEquals(5, listFilesInRepo(repoName).size()); } @Test @@ -1139,7 +1139,7 @@ public void purgeObjectDoNothingWhenDoesNotExist() { repo.describeObject("o4"); }); - assertEquals(11, listFilesInRepo(repoName).size()); + assertEquals(12, listFilesInRepo(repoName).size()); } @Test diff --git a/ocfl-java-itest/src/test/java/edu/wisc/library/ocfl/itest/filesystem/FileSystemOcflITest.java b/ocfl-java-itest/src/test/java/edu/wisc/library/ocfl/itest/filesystem/FileSystemOcflITest.java index 47504bea..7b186bb2 100644 --- a/ocfl-java-itest/src/test/java/edu/wisc/library/ocfl/itest/filesystem/FileSystemOcflITest.java +++ b/ocfl-java-itest/src/test/java/edu/wisc/library/ocfl/itest/filesystem/FileSystemOcflITest.java @@ -5,6 +5,7 @@ import edu.wisc.library.ocfl.api.model.ObjectVersionId; import 
edu.wisc.library.ocfl.core.OcflRepositoryBuilder; import edu.wisc.library.ocfl.core.cache.NoOpCache; +import edu.wisc.library.ocfl.core.extension.storage.layout.HashedNTupleLayoutExtension; import edu.wisc.library.ocfl.core.extension.storage.layout.config.HashedNTupleIdEncapsulationLayoutConfig; import edu.wisc.library.ocfl.core.extension.storage.layout.config.HashedNTupleLayoutConfig; import edu.wisc.library.ocfl.core.util.FileUtil; @@ -86,7 +87,9 @@ public void purgeShouldRemoveEmptyParentDirs() throws IOException { repo.purgeObject(objectId); assertThat(Arrays.asList(repoDir(repoName).toFile().list()).stream().collect(Collectors.toList()), - containsInAnyOrder("0=ocfl_1.0", "ocfl_1.0.txt", OcflConstants.EXTENSIONS_DIR, OcflConstants.OCFL_LAYOUT)); + containsInAnyOrder("0=ocfl_1.0", "ocfl_1.0.txt", + OcflConstants.EXTENSIONS_DIR, OcflConstants.OCFL_LAYOUT, + HashedNTupleLayoutExtension.EXTENSION_NAME + ".md")); } @Override diff --git a/ocfl-java-itest/src/test/resources/expected/repos/different-content/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/different-content/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/different-content/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. 
+ +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. + +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name**: `tupleSize` + * **Description:** Indicates the segment size (in characters) to split the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted 
to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. 
+ +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. + +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. 
+However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgoirthm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/different-digest/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/different-digest/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/different-digest/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name**: `tupleSize` + * **Description:** Indicates the segment size (in characters) to split the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgoirthm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/flat-layout/0002-flat-direct-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/flat-layout/0002-flat-direct-storage-layout.md new file mode 100644 index 00000000..5acd6a68 --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/flat-layout/0002-flat-direct-storage-layout.md @@ -0,0 +1,69 @@ +# OCFL Community Extension 0002: Flat Direct Storage Layout + +* **Extension Name:** 0002-flat-direct-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes a simple flat OCFL storage layout. OCFL object identifiers are mapped directly to directory names that are direct children of the OCFL storage root directory. 
+ +The limitations of this layout are filesystem dependent, but are generally as follows: + +* The size of object IDs cannot exceed the maximum allowed directory name size (eg. 255 characters) +* Object IDs cannot include characters that are illegal in directory names +* Performance may degrade as the size of a repository increases because every object is a direct child of the storage root + +## Parameters + +This extension has no parameters. + +## Procedure + +The OCFL object identifier is used, without any changes, as the object's root path within the OCFL storage root. + +## Examples + +### Example 1 + +This example demonstrates some mappings that produce directory names that are valid on unix filesystems. + +#### Mappings + +| Object ID | Object Root Path | +| --- | --- | +| object-01 | `object-01` | +| ..hor\_rib:lé-$id | `..hor_rib:lé-$id` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── object-01/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ..hor_rib:lé-$id/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates some mappings that produce directory names that are invalid on unix filesystems; therefore this layout cannot be used in a repository that needs to be able to store objects with IDs like these. 
+ +#### Mappings + +| Object ID | Object Root Path | +| --- | --- | +| info:fedora/object-01 | `info:fedora/object-01` | +| abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij | `abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij` | + diff --git a/ocfl-java-itest/src/test/resources/expected/repos/hashed-id-layout-2/0003-hash-and-id-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/hashed-id-layout-2/0003-hash-and-id-n-tuple-storage-layout.md new file mode 100644 index 00000000..85095f75 --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/hashed-id-layout-2/0003-hash-and-id-n-tuple-storage-layout.md @@ -0,0 +1,380 @@ +# OCFL Community Extension 0003: Hashed Truncated N-tuple Trees with Object ID Encapsulating Directory for OCFL Storage Hierarchies + +* **Extension Name:** 0003-hash-and-id-n-tuple-storage-layout +* **Authors:** Ben Cail +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters, to OCFL object root directories. + +Using this extension, OCFL object identifiers are hashed and encoded as hex +strings (all letters lower-case). These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL storage +root. 
Finally, the OCFL object identifier is percent-encoded to create a +directory name for the OCFL object root (see ["Encapsulation +Directory"](#encapsulation-directory) section below). + +The n-tuple segments approach allows OCFL object identifiers to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. The encoded encapsulation directory name provides visibility into +the object identifier from the file path (see ["Encapsulation +Directory"](#encapsulation-directory) section below for details). + +## Encapsulation Directory + +For basic OCFL object identifiers, the object identifier is used as the name of +the encapsulation directory (ie. the object root directory). + +Some object identifiers could contain characters that are not safe for directory +names on all filesystems. Safe characters are defined as A-Z, a-z, 0-9, '-' and +'\_'. When an unsafe character is encountered in an object identifier, it is +percent-encoded using the lower-case hex characters of its UTF-8 encoding. + +Some object identifiers could also result in an encoded string that is longer +than can be supported as a directory name. To handle that scenario, if the +percent-encoded object identifier is longer than 100 characters, it is truncated +to 100 characters, and then the digest of the original object identifier is +appended to the encoded object identifier like this: +\<object_id_encoded_first_100_chars\>-\<digest\>. Note: this means that it +is no longer possible to determine the full object identifier from the +encapsulation directory name - some characters have been removed, and even the +first 100 characters of the encoded object identifier cannot be fully, reliably +decoded, because the truncation may leave a partial encoding at the end of the +100 characters. 
+ +| Object ID | Encapsulation Directory Name | +| --- | --- | +| object-01 | object-01 | +| ..Hor/rib:lè-$id | %2e%2eHor%2frib%3al%c3%a8-%24id | +| abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghija | abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij-5cc73e648fbcff136510e330871180922ddacf193b68fdeff855683a01464220 | + +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply to the OCFL object identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name**: `tupleSize` + * **Description:** Indicates the size of the segments (in characters) that the digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates how many segments are used for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in each tuple. +The tuples are used as directory names. The default value is `3`, which means +that each intermediate directory in the OCFL storage hierarchy could contain up +to 4096 directories. Increasing this value increases the maximum number of +sub-directories per directory. 
+ +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +## Procedure + +The following is an outline of the steps to follow to map an OCFL object +identifier to an OCFL object root path using this extension (also see the +["Python Code"](#python-code) section): + +1. The OCFL object identifier is encoded as UTF-8 and hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lower-case hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. The OCFL object identifier is percent-encoded to create the encapsulation + directory name (see ["Encapsulation Directory"](#encapsulation-directory) + section above for details). +6. The encapsulation directory name is joined to the end of the path. + +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +this extension's default configuration. 
+ +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0003-hash-and-id-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3 +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --- | --- | --- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/object-01` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/%2e%2ehor%2frib%3ale-%24id` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/0003-hash-and-id-n-tuple-storage-layout/config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── object-01/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── %2e%2ehor%2frib%3ale-%24id/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0003-hash-and-id-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15 +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --- | --- | --- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/object-01` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/%2e%2ehor%2frib%3ale-%24id` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/0003-hash-and-id-n-tuple-storage-layout/config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── %2e%2ehor%2frib%3ale-%24id/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── object-01/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0003-hash-and-id-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0 +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --- | --- | --- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `object-01` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `%2e%2ehor%2frib%3ale-%24id` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/0003-hash-and-id-n-tuple-storage-layout/config.json +├── object-01/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── %2e%2ehor%2frib%3ale-%24id/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +## Python Code + +Here is some python code that implements the algorithm: + +``` +import codecs +import hashlib +import os +import re + + +def _percent_encode(c): + c_bytes = c.encode('utf8') + s = '' + for b in c_bytes: + s += '%' + codecs.encode(bytes([b]), encoding='hex_codec').decode('utf8') + return s + + +def _get_encapsulation_directory(object_id, digest): + d = '' + for c in object_id: + if re.match(r'[A-Za-z0-9-_]{1}', c): + d += c + else: + d += _percent_encode(c) + if len(d) > 100: + return f'{d[:100]}-{digest}' + return d + + +def ocfl_path(object_id, algorithm='sha256', tuple_size=3, number_of_tuples=3): + object_id_utf8 = object_id.encode('utf8') + if algorithm == 'md5': + digest = hashlib.md5(object_id_utf8).hexdigest() + elif algorithm == 'sha256': + digest = hashlib.sha256(object_id_utf8).hexdigest() + elif algorithm == 'sha512': + digest = hashlib.sha512(object_id_utf8).hexdigest() + digest = digest.lower() + path = '' + for i in range(number_of_tuples): + part = digest[i*tuple_size:i*tuple_size+tuple_size] + path = os.path.join(path, part) + encapsulation_directory = 
_get_encapsulation_directory(object_id, digest=digest) + path = os.path.join(path, encapsulation_directory) + return path + + +def _check_path(object_id, correct_path, algorithm='sha256', tuple_size=3, number_of_tuples=3): + p = ocfl_path(object_id, algorithm=algorithm, tuple_size=tuple_size, number_of_tuples=number_of_tuples) + assert p == correct_path, f'{p} != {correct_path}' + print(f' "{object_id}" {algorithm} => {p}') + + +def run_tests(): + print('running tests...') + assert _percent_encode('.') == '%2e' + assert _percent_encode('ç') == '%c3%a7' + _check_path(object_id='object-01', correct_path='3c0/ff4/240/object-01') + _check_path(object_id='object-01', correct_path='ff7/553/449/object-01', algorithm='md5') + _check_path(object_id='object-01', correct_path='ff755/34492/object-01', algorithm='md5', tuple_size=5, number_of_tuples=2) + _check_path(object_id='object-01', correct_path='object-01', algorithm='md5', tuple_size=0, number_of_tuples=0) + _check_path(object_id='object-01', correct_path='ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/object-01', algorithm='md5', tuple_size=2, number_of_tuples=15) + _check_path(object_id='..hor/rib:le-$id', correct_path='487/326/d8c/%2e%2ehor%2frib%3ale-%24id') + _check_path(object_id='..hor/rib:le-$id', correct_path='083/197/66f/%2e%2ehor%2frib%3ale-%24id', algorithm='md5') #08319766fb6c2935dd175b94267717e0 + _check_path(object_id='..Hor/rib:lè-$id', correct_path='373/529/21a/%2e%2eHor%2frib%3al%c3%a8-%24id') + long_object_id = 'abcdefghij' * 26 + long_object_id_digest = '55b432806f4e270da0cf23815ed338742179002153cd8d896f23b3e2d8a14359' + _check_path(object_id=long_object_id, correct_path=f'55b/432/806/abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij-{long_object_id_digest}') + long_object_id_101 = 'abcdefghij' * 10 + 'a' + long_object_id_101_digest = '5cc73e648fbcff136510e330871180922ddacf193b68fdeff855683a01464220' + _check_path(object_id=long_object_id_101, 
correct_path=f'5cc/73e/648/abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij-{long_object_id_101_digest}') + + +if __name__ == '__main__': + run_tests() +``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/hashed-id-layout/0003-hash-and-id-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/hashed-id-layout/0003-hash-and-id-n-tuple-storage-layout.md new file mode 100644 index 00000000..85095f75 --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/hashed-id-layout/0003-hash-and-id-n-tuple-storage-layout.md @@ -0,0 +1,380 @@ +# OCFL Community Extension 0003: Hashed Truncated N-tuple Trees with Object ID Encapsulating Directory for OCFL Storage Hierarchies + +* **Extension Name:** 0003-hash-and-id-n-tuple-storage-layout +* **Authors:** Ben Cail +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters, to OCFL object root directories. + +Using this extension, OCFL object identifiers are hashed and encoded as hex +strings (all letters lower-case). These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL storage +root. Finally, the OCFL object identifier is percent-encoded to create a +directory name for the OCFL object root (see ["Encapsulation +Directory"](#encapsulation-directory) section below). + +The n-tuple segments approach allows OCFL object identifiers to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. 
The encoded encapsulation directory name provides visibility into +the object identifier from the file path (see ["Encapsulation +Directory"](#encapsulation-directory) section below for details). + +## Encapsulation Directory + +For basic OCFL object identifiers, the object identifier is used as the name of +the encapsulation directory (ie. the object root directory). + +Some object identifiers could contain characters that are not safe for directory +names on all filesystems. Safe characters are defined as A-Z, a-z, 0-9, '-' and +'\_'. When an unsafe character is encountered in an object identifier, it is +percent-encoded using the lower-case hex characters of its UTF-8 encoding. + +Some object identifiers could also result in an encoded string that is longer +than can be supported as a directory name. To handle that scenario, if the +percent-encoded object identifier is longer than 100 characters, it is truncated +to 100 characters, and then the digest of the original object identifier is +appended to the encoded object identifier like this: +\<object_id_encoded_first_100_chars\>-\<digest\>. Note: this means that it +is no longer possible to determine the full object identifier from the +encapsulation directory name - some characters have been removed, and even the +first 100 characters of the encoded object identifier cannot be fully, reliably +decoded, because the truncation may leave a partial encoding at the end of the +100 characters. 
+ +| Object ID | Encapsulation Directory Name | +| --- | --- | +| object-01 | object-01 | +| ..Hor/rib:lè-$id | %2e%2eHor%2frib%3al%c3%a8-%24id | +| abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghija | abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij-5cc73e648fbcff136510e330871180922ddacf193b68fdeff855683a01464220 | + +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply to the OCFL object identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name**: `tupleSize` + * **Description:** Indicates the size of the segments (in characters) that the digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates how many segments are used for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in each tuple. +The tuples are used as directory names. The default value is `3`, which means +that each intermediate directory in the OCFL storage hierarchy could contain up +to 4096 directories. Increasing this value increases the maximum number of +sub-directories per directory. 
+ +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +## Procedure + +The following is an outline of the steps to follow to map an OCFL object +identifier to an OCFL object root path using this extension (also see the +["Python Code"](#python-code) section): + +1. The OCFL object identifier is encoded as UTF-8 and hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lower-case hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. The OCFL object identifier is percent-encoded to create the encapsulation + directory name (see ["Encapsulation Directory"](#encapsulation-directory) + section above for details). +6. The encapsulation directory name is joined to the end of the path. + +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +this extension's default configuration. 
+ +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0003-hash-and-id-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3 +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --- | --- | --- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/object-01` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/%2e%2ehor%2frib%3ale-%24id` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/0003-hash-and-id-n-tuple-storage-layout/config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── object-01/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── %2e%2ehor%2frib%3ale-%24id/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0003-hash-and-id-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15 +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --- | --- | --- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/object-01` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/%2e%2ehor%2frib%3ale-%24id` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/0003-hash-and-id-n-tuple-storage-layout/config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── %2e%2ehor%2frib%3ale-%24id/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── object-01/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0003-hash-and-id-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0 +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --- | --- | --- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `object-01` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `%2e%2ehor%2frib%3ale-%24id` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/0003-hash-and-id-n-tuple-storage-layout/config.json +├── object-01/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── %2e%2ehor%2frib%3ale-%24id/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +## Python Code + +Here is some python code that implements the algorithm: + +``` +import codecs +import hashlib +import os +import re + + +def _percent_encode(c): + c_bytes = c.encode('utf8') + s = '' + for b in c_bytes: + s += '%' + codecs.encode(bytes([b]), encoding='hex_codec').decode('utf8') + return s + + +def _get_encapsulation_directory(object_id, digest): + d = '' + for c in object_id: + if re.match(r'[A-Za-z0-9-_]{1}', c): + d += c + else: + d += _percent_encode(c) + if len(d) > 100: + return f'{d[:100]}-{digest}' + return d + + +def ocfl_path(object_id, algorithm='sha256', tuple_size=3, number_of_tuples=3): + object_id_utf8 = object_id.encode('utf8') + if algorithm == 'md5': + digest = hashlib.md5(object_id_utf8).hexdigest() + elif algorithm == 'sha256': + digest = hashlib.sha256(object_id_utf8).hexdigest() + elif algorithm == 'sha512': + digest = hashlib.sha512(object_id_utf8).hexdigest() + digest = digest.lower() + path = '' + for i in range(number_of_tuples): + part = digest[i*tuple_size:i*tuple_size+tuple_size] + path = os.path.join(path, part) + encapsulation_directory = 
_get_encapsulation_directory(object_id, digest=digest) + path = os.path.join(path, encapsulation_directory) + return path + + +def _check_path(object_id, correct_path, algorithm='sha256', tuple_size=3, number_of_tuples=3): + p = ocfl_path(object_id, algorithm=algorithm, tuple_size=tuple_size, number_of_tuples=number_of_tuples) + assert p == correct_path, f'{p} != {correct_path}' + print(f' "{object_id}" {algorithm} => {p}') + + +def run_tests(): + print('running tests...') + assert _percent_encode('.') == '%2e' + assert _percent_encode('ç') == '%c3%a7' + _check_path(object_id='object-01', correct_path='3c0/ff4/240/object-01') + _check_path(object_id='object-01', correct_path='ff7/553/449/object-01', algorithm='md5') + _check_path(object_id='object-01', correct_path='ff755/34492/object-01', algorithm='md5', tuple_size=5, number_of_tuples=2) + _check_path(object_id='object-01', correct_path='object-01', algorithm='md5', tuple_size=0, number_of_tuples=0) + _check_path(object_id='object-01', correct_path='ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/object-01', algorithm='md5', tuple_size=2, number_of_tuples=15) + _check_path(object_id='..hor/rib:le-$id', correct_path='487/326/d8c/%2e%2ehor%2frib%3ale-%24id') + _check_path(object_id='..hor/rib:le-$id', correct_path='083/197/66f/%2e%2ehor%2frib%3ale-%24id', algorithm='md5') #08319766fb6c2935dd175b94267717e0 + _check_path(object_id='..Hor/rib:lè-$id', correct_path='373/529/21a/%2e%2eHor%2frib%3al%c3%a8-%24id') + long_object_id = 'abcdefghij' * 26 + long_object_id_digest = '55b432806f4e270da0cf23815ed338742179002153cd8d896f23b3e2d8a14359' + _check_path(object_id=long_object_id, correct_path=f'55b/432/806/abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij-{long_object_id_digest}') + long_object_id_101 = 'abcdefghij' * 10 + 'a' + long_object_id_101_digest = '5cc73e648fbcff136510e330871180922ddacf193b68fdeff855683a01464220' + _check_path(object_id=long_object_id_101, 
correct_path=f'5cc/73e/648/abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij-{long_object_id_101_digest}') + + +if __name__ == '__main__': + run_tests() +``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable1/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/mutable1/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable1/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. 
+ +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. + +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name**: `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. 
+ +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. 
If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. + +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json.sha512 b/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json.sha512 deleted file mode 100644 index ec279be0..00000000 --- a/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json.sha512 +++ /dev/null @@ -1 +0,0 @@ -9eedac2a1364d0551b5131f589e5924c5e0ca2af04d8dd4f944d914c6cb5e5dd9a1b2351584716ee2b074122ee4bc1694c8429ef7bfe16bb5cdae054432a45be inventory.json \ No newline at end of file diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/content/r1/dir1/file3 b/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/content/r1/dir1/file3 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/content/r1/dir1/file3 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/content/r1/dir1/file3 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/content/r2/file5 
b/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/content/r2/file5 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/content/r2/file5 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/content/r2/file5 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json b/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json similarity index 94% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json rename to ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json index 5e5e5816..fe087d68 100644 --- a/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json @@ -7,9 +7,9 @@ "fixity" : { }, "manifest" : { "4cf0ff5673ec65d9900df95502ed92b2605fc602ca20b6901652c7561b302668026095813af6adb0e663bdcdbe1f276d18bf0de254992a78573ad6574e7ae1f6" : 
[ "v1/content/file2" ], - "891ba5a657177c5f94b5179fe23e31d0c05dd7506bf22d24b280585cdb052aefe5d7336ac7ba697215d6d45a09cbaa7b845d576fee6626fc5fd303b0efe2cc9d" : [ "extensions/0004-mutable-head/head/content/r2/file5" ], + "891ba5a657177c5f94b5179fe23e31d0c05dd7506bf22d24b280585cdb052aefe5d7336ac7ba697215d6d45a09cbaa7b845d576fee6626fc5fd303b0efe2cc9d" : [ "extensions/0005-mutable-head/head/content/r2/file5" ], "96a26e7629b55187f9ba3edc4acc940495d582093b8a88cb1f0303cf3399fe6b1f5283d76dfd561fc401a0cdf878c5aad9f2d6e7e2d9ceee678757bb5d95c39e" : [ "v1/content/file1" ], - "b10ff867df18165a0e100d99cd3d27f845f7ef9ad84eeb627a53aabaea04805940c3693154b8a32541a31887dda9fb1e667e93307473b1c581021714768bd032" : [ "extensions/0004-mutable-head/head/content/r1/dir1/file3" ] + "b10ff867df18165a0e100d99cd3d27f845f7ef9ad84eeb627a53aabaea04805940c3693154b8a32541a31887dda9fb1e667e93307473b1c581021714768bd032" : [ "extensions/0005-mutable-head/head/content/r1/dir1/file3" ] }, "versions" : { "v1" : { diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json.sha512 b/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json.sha512 new file mode 100644 index 00000000..f093cb0e --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json.sha512 @@ -0,0 +1 @@ +89439a330e797cdb00a88b5aa178df714bc2465b71e4399160f4d633050ebbcdbf724c843231bfa1e67cddc0c82c41ce81b67c6fbc8b23723191a06a3c9b4ed3 inventory.json \ No newline at end of file diff --git 
a/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r1 b/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r1 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r1 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r1 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r2 b/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r2 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r2 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r2 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/root-inventory.json.sha512 b/ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/root-inventory.json.sha512 similarity index 100% rename from 
ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/root-inventory.json.sha512 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable1/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/root-inventory.json.sha512 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable2/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/mutable2/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable2/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. 
Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. + +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the size of the segments (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. 
The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. 
Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. + +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. + +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json.sha512 b/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json.sha512 deleted file mode 100644 index af515ad8..00000000 --- a/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json.sha512 +++ /dev/null @@ -1 +0,0 @@ -3d4e2a70bf5aa943134e0b0a92674e3315da26fecb7499438399e35cdd3f61c20544b937ebbbaf3ed3c04c7bd17345ed300dc0a1b08e0e2b3ba6c9979193182e inventory.json \ No newline at end of file diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/content/r1/dir1/.gitkeep b/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/content/r1/dir1/.gitkeep similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/content/r1/dir1/.gitkeep rename to ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/content/r1/dir1/.gitkeep diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/content/r2/file4 
b/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/content/r2/file4 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/content/r2/file4 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/content/r2/file4 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json b/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json similarity index 93% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json rename to ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json index 7d2b5e02..ebecb511 100644 --- a/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json @@ -6,7 +6,7 @@ "contentDirectory" : "content", "fixity" : { }, "manifest" : { - 
"3eb3222d4b5629a27dede47a90c7665fcaff1305c2c6c645465edc8278768032ca223f9193f3e9d6bb7a8b950ed284030d256ce1ee33e1de7e9f3e2cb1a834a0" : [ "extensions/0004-mutable-head/head/content/r2/file4" ] + "3eb3222d4b5629a27dede47a90c7665fcaff1305c2c6c645465edc8278768032ca223f9193f3e9d6bb7a8b950ed284030d256ce1ee33e1de7e9f3e2cb1a834a0" : [ "extensions/0005-mutable-head/head/content/r2/file4" ] }, "versions" : { "v1" : { diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json.sha512 b/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json.sha512 new file mode 100644 index 00000000..06c3935b --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json.sha512 @@ -0,0 +1 @@ +8904aa415706c76e7ecc03f26db4b57e6fb02b3edd9ce5d2d9bb543ca23b1ac02e92f3d5c9ddac0117f003fc2cee5a66482e9916fe99f5dfd6adbaefeaaf5789 inventory.json \ No newline at end of file diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r1 b/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r1 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r1 rename to 
ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r1 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r2 b/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r2 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r2 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r2 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/root-inventory.json.sha512 b/ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/root-inventory.json.sha512 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/root-inventory.json.sha512 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable2/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/root-inventory.json.sha512 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable3/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/mutable3/0004-hashed-n-tuple-storage-layout.md new file mode 
100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable3/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable4/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/mutable4/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable4/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable5/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/mutable5/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable5/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json.sha512 b/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json.sha512 deleted file mode 100644 index 382f80e9..00000000 --- a/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json.sha512 +++ /dev/null @@ -1 +0,0 @@ -df4419d36a854205426d174db5b1fcf53ab1df4a55b42f7696bf93ae1c91ef6c76252acb57969ef2d6eb57587ffc10cdaeb8402ad5963b82aca1e79c19246842 inventory.json \ No newline at end of file diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/content/r1/dir1/file3 b/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/content/r1/dir1/file3 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/content/r1/dir1/file3 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/content/r1/dir1/file3 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json 
b/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json similarity index 96% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json rename to ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json index e4a99f10..89972b3e 100644 --- a/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json @@ -8,7 +8,7 @@ "manifest" : { "4cf0ff5673ec65d9900df95502ed92b2605fc602ca20b6901652c7561b302668026095813af6adb0e663bdcdbe1f276d18bf0de254992a78573ad6574e7ae1f6" : [ "v1/content/file2" ], "96a26e7629b55187f9ba3edc4acc940495d582093b8a88cb1f0303cf3399fe6b1f5283d76dfd561fc401a0cdf878c5aad9f2d6e7e2d9ceee678757bb5d95c39e" : [ "v1/content/file1" ], - "b10ff867df18165a0e100d99cd3d27f845f7ef9ad84eeb627a53aabaea04805940c3693154b8a32541a31887dda9fb1e667e93307473b1c581021714768bd032" : [ "extensions/0004-mutable-head/head/content/r1/dir1/file3" ] + "b10ff867df18165a0e100d99cd3d27f845f7ef9ad84eeb627a53aabaea04805940c3693154b8a32541a31887dda9fb1e667e93307473b1c581021714768bd032" : [ "extensions/0005-mutable-head/head/content/r1/dir1/file3" ] }, "versions" : { "v1" : { diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json.sha512 
b/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json.sha512 new file mode 100644 index 00000000..016c9470 --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json.sha512 @@ -0,0 +1 @@ +467c811380f323f62ed292bd715f05d77973d95062d5c8e95157c1656c7e74fb8852d918526c50003cfceacac58b347063226847d0e7800c3f01d8f46c29938e inventory.json \ No newline at end of file diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r1 b/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r1 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r1 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r1 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r2 b/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r2 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r2 rename 
to ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r2 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/root-inventory.json.sha512 b/ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/root-inventory.json.sha512 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/root-inventory.json.sha512 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable5/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/root-inventory.json.sha512 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable6/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/mutable6/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable6/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. 
+ +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. + +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name**: `tupleSize` + * **Description:** Indicates the segment size (in characters) to split the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted 
to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. 
+ +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. + +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. 
+However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgoirthm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json.sha512 b/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json.sha512 deleted file mode 100644 index a4929340..00000000 --- a/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json.sha512 +++ /dev/null @@ -1 +0,0 @@ -70d904d6047a8ab838b4c9716b45b791250174c8a05a7bbab5698fc3188603f7310d4c9fb09a340753d0cd58d9420ea5a12f1dff0bdd388e3c4649b5e54fe51c inventory.json \ No newline at end of file diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/content/r1/dir1/file3 b/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/content/r1/dir1/file3 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/content/r1/dir1/file3 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/content/r1/dir1/file3 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json 
b/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json similarity index 96% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json rename to ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json index b228e8f3..b3d3398a 100644 --- a/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/head/inventory.json +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json @@ -8,7 +8,7 @@ "manifest" : { "4cf0ff5673ec65d9900df95502ed92b2605fc602ca20b6901652c7561b302668026095813af6adb0e663bdcdbe1f276d18bf0de254992a78573ad6574e7ae1f6" : [ "v1/content/file2" ], "96a26e7629b55187f9ba3edc4acc940495d582093b8a88cb1f0303cf3399fe6b1f5283d76dfd561fc401a0cdf878c5aad9f2d6e7e2d9ceee678757bb5d95c39e" : [ "v1/content/file1" ], - "b10ff867df18165a0e100d99cd3d27f845f7ef9ad84eeb627a53aabaea04805940c3693154b8a32541a31887dda9fb1e667e93307473b1c581021714768bd032" : [ "extensions/0004-mutable-head/head/content/r1/dir1/file3" ] + "b10ff867df18165a0e100d99cd3d27f845f7ef9ad84eeb627a53aabaea04805940c3693154b8a32541a31887dda9fb1e667e93307473b1c581021714768bd032" : [ "extensions/0005-mutable-head/head/content/r1/dir1/file3" ] }, "versions" : { "v1" : { diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json.sha512 
b/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json.sha512 new file mode 100644 index 00000000..1cf4aff6 --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/head/inventory.json.sha512 @@ -0,0 +1 @@ +0ac4800cf955a913680557a68457f55aeebcf5bb74e71bab112ea7d6dae82bdc1f3160f30435107a3d921617ce400ff248635e79ea1edd57f9b967d4725bd4bd inventory.json \ No newline at end of file diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r1 b/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r1 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r1 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r1 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r2 b/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r2 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/revisions/r2 rename 
to ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/revisions/r2 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/root-inventory.json.sha512 b/ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/root-inventory.json.sha512 similarity index 100% rename from ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0004-mutable-head/root-inventory.json.sha512 rename to ocfl-java-itest/src/test/resources/expected/repos/mutable6/235/2da/728/2352da7280f1decc3acf1ba84eb945c9fc2b7b541094e1d0992dbffd1b6664cc/extensions/0005-mutable-head/root-inventory.json.sha512 diff --git a/ocfl-java-itest/src/test/resources/expected/repos/replicate1/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/replicate1/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/replicate1/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. 
+ +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. + +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name**: `tupleSize` + * **Description:** Indicates the segment size (in characters) to split the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted 
to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. 
+ +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. + +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. 
+However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgoirthm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/replicate2/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/replicate2/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/replicate2/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name**: `tupleSize` + * **Description:** Indicates the segment size (in characters) to split the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo-multiple-objects/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo-multiple-objects/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo-multiple-objects/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo1/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo1/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo1/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo10/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo10/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo10/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo11/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo11/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo11/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo12/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo12/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo12/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo13/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo13/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo13/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo14/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo14/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo14/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the size (in characters) of the segments the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, a smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo15/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo15/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo15/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the size (in characters) of the segments the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, a smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo16/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo16/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo16/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the size (in characters) of the segments the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, a smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo17/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo17/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo17/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the size (in characters) of the segments the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo2/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo2/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo2/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo3/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo3/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo3/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo4/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo4/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo4/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo5/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo5/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo5/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) to split the + digest into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo7/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo7/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo7/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) to split the + digest into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo8/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo8/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo8/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) to split the + digest into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/repo9/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/repo9/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/repo9/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the size (in characters) of each segment that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/rollback1/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/rollback1/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/rollback1/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the size (in characters) of each segment that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/rollback2/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/rollback2/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/rollback2/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the size (in characters) of each segment that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/rollback3/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/rollback3/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/rollback3/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the size (in characters) of each segment that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/windows-safe/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/windows-safe/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/windows-safe/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/expected/repos/zero-padded/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/expected/repos/zero-padded/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/expected/repos/zero-padded/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/invalid-repos/bad-logical-paths/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/invalid-repos/bad-logical-paths/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/invalid-repos/bad-logical-paths/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/invalid-repos/invalid-sidecar-digest/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/invalid-repos/invalid-sidecar-digest/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/invalid-repos/invalid-sidecar-digest/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, a smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/invalid-repos/missing-inventory/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/invalid-repos/missing-inventory/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/invalid-repos/missing-inventory/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, a smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/invalid-repos/missing-sidecar/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/invalid-repos/missing-sidecar/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/invalid-repos/missing-sidecar/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, a smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/invalid-repos/missing-version/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/invalid-repos/missing-version/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/invalid-repos/missing-version/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/sources/repos/different-content/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/sources/repos/different-content/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/sources/repos/different-content/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/sources/repos/different-digest/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/sources/repos/different-digest/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/sources/repos/different-digest/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/sources/repos/invalid-digest-algorithm/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/sources/repos/invalid-digest-algorithm/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/sources/repos/invalid-digest-algorithm/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest. 
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`.
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/sources/repos/invalid-file-fixity/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/sources/repos/invalid-file-fixity/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/sources/repos/invalid-file-fixity/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest.
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`.
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/sources/repos/invalid-inventory-fixity/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/sources/repos/invalid-inventory-fixity/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/sources/repos/invalid-inventory-fixity/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest.
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`.
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] 
+``` diff --git a/ocfl-java-itest/src/test/resources/sources/repos/zero-padded/0004-hashed-n-tuple-storage-layout.md b/ocfl-java-itest/src/test/resources/sources/repos/zero-padded/0004-hashed-n-tuple-storage-layout.md new file mode 100644 index 00000000..ed29121c --- /dev/null +++ b/ocfl-java-itest/src/test/resources/sources/repos/zero-padded/0004-hashed-n-tuple-storage-layout.md @@ -0,0 +1,303 @@ +# OCFL Community Extension 0004: Hashed N-tuple Storage Layout + +* **Extension Name:** 0004-hashed-n-tuple-storage-layout +* **Authors:** Peter Winckles +* **Minimum OCFL Version:** 1.0 +* **OCFL Community Extensions Version:** 1.0 +* **Obsoletes:** n/a +* **Obsoleted by:** n/a + +## Overview + +This storage root extension describes how to safely map OCFL object identifiers +of any length, containing any characters to OCFL object root directories with +the primary goals of ensuring portability and filesystem performance at the cost +of directory name transparency. + +Using this extension, OCFL object identifiers are hashed and encoded +as lowercase hex strings. These digests are then divided into _N_ +n-tuple segments, which are used to create nested paths under the OCFL +storage root. + +This approach allows OCFL object identifiers of any composition to be evenly +distributed across the storage hierarchy. The maximum number of files under any +given directory is controlled by the number of characters in each n-tuple, and +the tree depth is controlled by the number of n-tuple segments each digest is +divided into. Additionally, it obviates the need to handle special characters in +OCFL object identifiers because the mapped directory names will only ever +contain the characters `0-9a-f`. + +However, this comes at the cost of not being able to identify the OCFL object +identifier of an object simply by browsing the OCFL storage hierarchy. The ID of +an object may only be found within its `inventory.json`. 
+ +## Parameters + +### Summary + +* **Name:** `digestAlgorithm` + * **Description:** The digest algorithm to apply on the OCFL object + identifier; MUST be an algorithm that is allowed in the OCFL fixity block + * **Type:** string + * **Constraints:** Must not be empty + * **Default:** sha256 +* **Name:** `tupleSize` + * **Description:** Indicates the segment size (in characters) that the + digest is split into + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `numberOfTuples` + * **Description:** Indicates the number of segments to use for path generation + * **Type:** number + * **Constraints:** An integer between 0 and 32 inclusive + * **Default:** 3 +* **Name:** `shortObjectRoot` + * **Description:** When true, indicates that the OCFL object root directory + name should contain the remainder of the digest not used in the n-tuple + segments + * **Type:** boolean + * **Default:** false + +### Details + +#### digestAlgorithm + +`digestAlgorithm` is defaulted to `sha256`, and it MUST either contain a digest +algorithm that's [officially supported by the OCFL +spec](https://ocfl.io/draft/spec/#digest-algorithms) or defined in a community +extension. The specified algorithm is applied to OCFL object identifiers to +produce hex encoded digest values that are then mapped to OCFL object root +paths. + +#### tupleSize + +`tupleSize` determines the number of digest characters to include in +each tuple. The tuples are used as directory names. The default value +is `3`, which means that each intermediate directory in the OCFL +storage hierarchy could contain up to 4096 sub-directories. Increasing +this value increases the maximum number of sub-directories per +directory. + +If `tupleSize` is set to `0`, then no tuples are created and `numberOfTuples` +MUST also equal `0`. + +The product of `tupleSize` and `numberOfTuples` MUST be less than or equal to +the number of characters in the hex encoded digest.
+ +#### numberOfTuples + +`numberOfTuples` determines how many tuples to create from the digest. The +tuples are used as directory names, and each successive directory is nested +within the previous. The default value is `3`, which means that every OCFL +object root will be 4 directories removed from the OCFL storage root, 3 tuple +directories plus 1 encapsulation directory. Increasing this value increases the +depth of the OCFL storage hierarchy. + +If `numberOfTuples` is set to `0`, then no tuples are created and `tupleSize` +MUST also equal `0`. + +The product of `numberOfTuples` and `tupleSize` MUST be less than or equal to +the number of characters in the hex encoded digest. + +#### shortObjectRoot + +The directory that immediately encapsulates an OCFL object MUST either be named +using the entire digest or the remainder of the digest that was not used in a +tuple. When `shortObjectRoot` is set to `false`, the default, the entire digest +is used, and, when it's `true` only the previously unused remainder is used. + +If the product of `tupleSize` and `numberOfTuples` is equal to the number of +characters in the hex encoded digest, then `shortObjectRoot` MUST be `false`. + +## Procedure + +The following is an outline of the steps to map an OCFL object identifier to an +OCFL object root path: + +1. The OCFL object identifier, UTF-8 encoded, is hashed using the specified + `digestAlgorithm`. +2. The digest is encoded as a lowercase hex string. +3. Starting at the beginning of the digest and working forwards, the digest is + divided into `numberOfTuples` tuples each containing `tupleSize` characters. +4. The tuples are joined, in order, using the filesystem path separator. +5. If `shortObjectRoot` is `true`, the remaining, unused portion of the digest + is joined on the end of this path. Otherwise, the entire digest is joined on + the end. 
+ +## Examples + +### Example 1 + +This example demonstrates what the OCFL storage hierarchy looks like when using +the default configuration. + +#### Parameters + +It is not necessary to specify any parameters to use the default configuration. +However, if you were to do so, it would look like the following: + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 3, + "numberOfTuples": 3, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0/ff4/240/3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487/326/d8c/487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0/ +│ └── ff4/ +│ └── 240/ +│ └── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487/ + └── 326/ + └── d8c/ + └── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 2 + +This example demonstrates the effects of modifying the default parameters to use +a different `digestAlgorithm`, smaller `tupleSize`, and a larger +`numberOfTuples`.
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "md5", + "tupleSize": 2, + "numberOfTuples": 15, + "shortObjectRoot": true +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | ff75534492485eabb39f86356728884e | `ff/75/53/44/92/48/5e/ab/b3/9f/86/35/67/28/88/4e` | +| ..hor/rib:le-$id | 08319766fb6c2935dd175b94267717e0 | `08/31/97/66/fb/6c/29/35/dd/17/5b/94/26/77/17/e0` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 08/ +│ └── 31/ +│ └── 97/ +│ └── 66/ +│ └── fb/ +│ └── 6c/ +│ └── 29/ +│ └── 35/ +│ └── dd/ +│ └── 17/ +│ └── 5b/ +│ └── 94/ +│ └── 26/ +│ └── 77/ +│ └── 17/ +│ └── e0/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── ff/ + └── 75/ + └── 53/ + └── 44/ + └── 92/ + └── 48/ + └── 5e/ + └── ab/ + └── b3/ + └── 9f/ + └── 86/ + └── 35/ + └── 67/ + └── 28/ + └── 88/ + └── 4e/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +``` + +### Example 3 + +This example demonstrates what happens when `tupleSize` and `numberOfTuples` are +set to `0`. This is an edge case and not a recommended configuration. 
+ +#### Parameters + +```json +{ + "extensionName": "0004-hashed-n-tuple-storage-layout", + "digestAlgorithm": "sha256", + "tupleSize": 0, + "numberOfTuples": 0, + "shortObjectRoot": false +} +``` + +#### Mappings + +| Object ID | Digest | Object Root Path | +| --------- | ------ | ---------------- | +| object-01 | 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4 | `3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4` | +| ..hor/rib:le-$id | 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d | `487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d` | + +#### Storage Hierarchy + +``` +[storage_root]/ +├── 0=ocfl_1.0 +├── ocfl_layout.json +├── extensions/ +│ └── 0004-hashed-n-tuple-storage-layout/ +│ └── config.json +├── 3c0ff4240c1e116dba14c7627f2319b58aa3d77606d0d90dfc6161608ac987d4/ +│ ├── 0=ocfl_object_1.0 +│ ├── inventory.json +│ ├── inventory.json.sha512 +│ └── v1 [...] +└── 487326d8c2a3c0b885e23da1469b4d6671fd4e76978924b4443e9e3c316cda6d/ + ├── 0=ocfl_object_1.0 + ├── inventory.json + ├── inventory.json.sha512 + └── v1 [...] +```