Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for variable length string arrays #16

Merged
merged 18 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 48 additions & 34 deletions src/main/java/org/janelia/saalfeldlab/n5/zarr/DType.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@
*/
package org.janelia.saalfeldlab.n5.zarr;

import java.lang.reflect.Type;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.EnumMap;

import org.janelia.saalfeldlab.n5.ByteArrayDataBlock;
Expand All @@ -42,13 +43,7 @@
import org.janelia.saalfeldlab.n5.LongArrayDataBlock;
import org.janelia.saalfeldlab.n5.ShortArrayDataBlock;

import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import com.google.gson.JsonParseException;
import com.google.gson.JsonPrimitive;
import com.google.gson.JsonSerializationContext;
import com.google.gson.JsonSerializer;
import static org.janelia.saalfeldlab.n5.zarr.Filter.VLEN_UTF8;

/**
* Enumerates available zarr data types as defined at
Expand All @@ -73,6 +68,8 @@ public class DType {
typestrs.put(DataType.UINT64, ">u8");
typestrs.put(DataType.FLOAT32, ">f4");
typestrs.put(DataType.FLOAT64, ">f8");
typestrs.put(DataType.STRING, "|O");
typestrs.put(DataType.OBJECT, "|O");
}

public static enum Primitive {
Expand Down Expand Up @@ -128,13 +125,13 @@ public static Primitive fromCode(final char code) {
/* the closest possible N5 DataType */
protected final DataType dataType;

public DType(final String typestr) {
public DType(final String typestr, final Collection<Filter> filters) {

this.typestr = typestr;

order = typestr.charAt(0) == '<' ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN;
final Primitive primitive = Primitive.fromCode(typestr.charAt(1));
final int nB = Integer.parseInt(typestr.substring(2));
final int nB = (primitive == Primitive.OBJECT) ? 0 : Integer.parseInt(typestr.substring(2));

switch (primitive) {
case BIT:
Expand Down Expand Up @@ -211,8 +208,20 @@ public DType(final String typestr) {
byteBlockFactory = (blockSize, gridPosition, numElements) ->
new ByteArrayDataBlock(blockSize, gridPosition, new byte[numElements * nBytes]);
break;
case OBJECT:
nBytes = 1;
nBits = 0;
if (filters.contains(VLEN_UTF8)) {
dataBlockFactory = (blockSize, gridPosition, numElements) ->
new ZarrStringDataBlock(blockSize, gridPosition, new String[0]);
byteBlockFactory = (blockSize, gridPosition, numElements) ->
new ByteArrayDataBlock(blockSize, gridPosition, new byte[numElements * nBytes]);
} else {
dataBlockFactory = null;
byteBlockFactory = null;
}
break;
// case BOOLEAN:
// case OBJECT: // not sure about this
// case OTHER: // not sure about this
// case STRING: // not sure about this
// case UNICODE: // not sure about this
Expand All @@ -227,7 +236,7 @@ public DType(final String typestr) {
new ByteArrayDataBlock(blockSize, gridPosition, new byte[numElements * nBytes]);
}

dataType = getDataType(primitive, nBytes);
dataType = getDataType(primitive, nBytes, filters);
}

public DType(final DataType dataType, final int nPrimitives) {
Expand Down Expand Up @@ -269,6 +278,11 @@ public DType(final DataType dataType, final int nPrimitives) {
break;
// case INT8:
// case UINT8:
case STRING:
nBytes = 1;
dataBlockFactory = (blockSize, gridPosition, numElements) ->
new ZarrStringDataBlock(blockSize, gridPosition, new String[0]);
break;
default:
nBytes = nPrimitives;
dataBlockFactory = (blockSize, gridPosition, numElements) ->
Expand All @@ -290,7 +304,8 @@ public DataType getDataType() {

protected final static DataType getDataType(
final Primitive primitive,
final int nBytes) {
final int nBytes,
final Collection<Filter> filters) {

switch (primitive) {
case INT:
Expand Down Expand Up @@ -333,6 +348,11 @@ protected final static DataType getDataType(
default:
return DataType.UINT8; // fallback
}
case OBJECT:
if (filters.contains(VLEN_UTF8))
return DataType.STRING;
else
return DataType.OBJECT;
default:
return DataType.UINT8; // fallback
}
Expand All @@ -345,6 +365,21 @@ public String toString() {
return typestr;
}

/**
 * Returns the {@link Filter filters} implied by this {@link DType}.
 * <p>
 * Only {@link DataType#STRING} implies a filter, namely
 * {@link Filter#VLEN_UTF8}. For every other data type this method
 * returns {@code null} rather than an empty collection.
 *
 * @return a new mutable collection containing {@link Filter#VLEN_UTF8} if
 *         the data type is {@link DataType#STRING}, {@code null} otherwise
 */
public Collection<Filter> getFilters() {

	// Anything that is not a string type carries no filter.
	if (dataType != DataType.STRING)
		return null;

	final ArrayList<Filter> stringFilters = new ArrayList<>();
	stringFilters.add(VLEN_UTF8);
	return stringFilters;
}

/**
* Factory for {@link DataBlock DataBlocks}.
*
Expand Down Expand Up @@ -406,27 +441,6 @@ private static interface ByteBlockFactory {
public ByteArrayDataBlock createByteBlock(final int[] blockSize, final long[] gridPosition, final int numElements);
}

/**
 * Gson adapter that converts a {@link DType} to and from its type string.
 */
public static class JsonAdapter implements JsonDeserializer<DType>, JsonSerializer<DType> {

	/**
	 * Creates a {@link DType} from the string value of the JSON element.
	 *
	 * @param json the JSON element holding the type string
	 * @param typeOfT unused
	 * @param context unused
	 * @return the {@link DType} constructed from the type string
	 */
	@Override
	public DType deserialize(
			final JsonElement json,
			final Type typeOfT,
			final JsonDeserializationContext context) throws JsonParseException {

		final String typestr = json.getAsString();
		return new DType(typestr);
	}

	/**
	 * Writes the {@link DType} as a JSON string primitive holding the
	 * result of {@link DType#toString()}.
	 *
	 * @param src the {@link DType} to serialize
	 * @param typeOfSrc unused
	 * @param context unused
	 * @return a JSON primitive containing the string form of {@code src}
	 */
	@Override
	public JsonElement serialize(
			final DType src,
			final Type typeOfSrc,
			final JsonSerializationContext context) {

		final String typestr = src.toString();
		return new JsonPrimitive(typestr);
	}
}

public ByteOrder getOrder() {

return order;
Expand Down
63 changes: 62 additions & 1 deletion src/main/java/org/janelia/saalfeldlab/n5/zarr/Filter.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,74 @@
*/
package org.janelia.saalfeldlab.n5.zarr;

import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParseException;
import com.google.gson.JsonPrimitive;
import com.google.gson.JsonSerializationContext;
import com.google.gson.JsonSerializer;

import java.lang.reflect.Type;

/**
* Place holder interface for filters
* Filter types
*
* TODO implement some
*
* @author Stephan Saalfeld &lt;[email protected]&gt;
* @author Michael Innerberger
*/
public interface Filter {

	/**
	 * @return the identifier of this filter as it appears under the
	 *         {@code "id"} key in its JSON form
	 */
	String getId();

	// Note: the JSON (de-)serializer below is tailored to this single filter,
	// which serializes to {"id":"vlen-utf8"}. If additional filters are
	// implemented, the type adapter below most likely has to change as well.
	Filter VLEN_UTF8 = new VLenStringFilter();

	/**
	 * Filter with the id {@code "vlen-utf8"}.
	 */
	class VLenStringFilter implements Filter {

		private static final String ID = "vlen-utf8";

		@Override
		public String getId() {

			return ID;
		}
	}

	/**
	 * Looks up a filter by its id.
	 *
	 * @param id the filter id, may be {@code null}
	 * @return {@link #VLEN_UTF8} if {@code id} equals its id, {@code null}
	 *         for any other value
	 */
	static Filter fromString(final String id) {

		return VLEN_UTF8.getId().equals(id) ? VLEN_UTF8 : null;
	}

	/** Shared adapter instance for registration with a Gson builder. */
	JsonAdapter jsonAdapter = new JsonAdapter();

	/**
	 * Gson adapter that reads and writes a {@link Filter} as a JSON object
	 * with a single {@code "id"} member.
	 */
	class JsonAdapter implements JsonDeserializer<Filter>, JsonSerializer<Filter> {

		/**
		 * Reads the {@code "id"} member of the JSON object and resolves it
		 * through {@link Filter#fromString(String)}.
		 *
		 * @return the matching filter, or {@code null} if the object has no
		 *         {@code "id"} member or the id is not recognized
		 */
		@Override
		public Filter deserialize(
				final JsonElement json,
				final Type typeOfT,
				final JsonDeserializationContext context) throws JsonParseException {

			final JsonElement idElement = json.getAsJsonObject().get("id");
			return idElement == null ? null : Filter.fromString(idElement.getAsString());
		}

		/**
		 * Writes the filter as {@code {"id": <filter id>}}.
		 */
		@Override
		public JsonElement serialize(
				final Filter filter,
				final Type typeOfSrc,
				final JsonSerializationContext context) {

			final JsonObject filterJson = new JsonObject();
			final JsonPrimitive idValue = new JsonPrimitive(filter.getId());
			filterJson.add("id", idValue);
			return filterJson;
		}
	}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ public class N5ZarrReader extends ZarrKeyValueReader {
* If true, fields from .zgroup, .zarray, and .zattrs will be merged
* when calling getAttributes, and variants of getAttribute
* @param cacheMeta cache attributes and meta data
* @param cacheMeta cache attributes and meta data
* Setting this to true avoids frequent reading and parsing of JSON
* encoded attributes and other meta data that requires accessing the
* store. This is most interesting for high latency backends. Changes
Expand All @@ -81,7 +80,6 @@ public N5ZarrReader(final String basePath,
new FileSystemKeyValueAccess(FileSystems.getDefault()),
basePath,
gsonBuilder
.registerTypeAdapter(DType.class, new DType.JsonAdapter())
.registerTypeAdapter(ZarrCompressor.class, ZarrCompressor.jsonAdapter),
mapN5DatasetAttributes,
mergeAttributes,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,5 +208,4 @@ public N5ZarrWriter(final String basePath) throws N5Exception {
this(basePath, new GsonBuilder());
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -215,16 +215,20 @@ public ZArrayAttributes deserialize(JsonElement json, Type typeOfT, JsonDeserial
final JsonObject obj = json.getAsJsonObject();
final JsonElement sepElem = obj.get("dimension_separator");
try {
final Collection<Filter> filters = context.deserialize(obj.get("filters"), TypeToken.getParameterized(Collection.class, Filter.class).getType());
final String typestr = context.deserialize(obj.get("dtype"), String.class);
final DType dType = new DType(typestr, filters);

return new ZArrayAttributes(
obj.get("zarr_format").getAsInt(),
context.deserialize( obj.get("shape"), long[].class),
context.deserialize( obj.get("chunks"), int[].class),
context.deserialize( obj.get("dtype"), DType.class), // fix
dType, // fix
context.deserialize( obj.get("compressor"), ZarrCompressor.class), // fix
obj.get("fill_value").getAsString(),
obj.get("order").getAsCharacter(),
sepElem != null ? sepElem.getAsString() : ".",
context.deserialize( obj.get("filters"), TypeToken.getParameterized(Collection.class, Filter.class).getType()));
filters);
} catch (Exception e) {
return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
*/
package org.janelia.saalfeldlab.n5.zarr;

import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
Expand All @@ -34,6 +36,7 @@
import java.nio.ByteBuffer;
import java.util.Arrays;

import org.apache.commons.compress.utils.IOUtils;
import org.janelia.saalfeldlab.n5.BlockReader;
import org.janelia.saalfeldlab.n5.ByteArrayDataBlock;
import org.janelia.saalfeldlab.n5.CachedGsonKeyValueN5Reader;
Expand All @@ -42,6 +45,7 @@
import org.janelia.saalfeldlab.n5.DataBlock;
import org.janelia.saalfeldlab.n5.DataType;
import org.janelia.saalfeldlab.n5.DatasetAttributes;
import org.janelia.saalfeldlab.n5.DefaultBlockReader;
import org.janelia.saalfeldlab.n5.GsonUtils;
import org.janelia.saalfeldlab.n5.KeyValueAccess;
import org.janelia.saalfeldlab.n5.LockedChannel;
Expand All @@ -50,6 +54,7 @@
import org.janelia.saalfeldlab.n5.N5Reader;
import org.janelia.saalfeldlab.n5.N5URI;
import org.janelia.saalfeldlab.n5.RawCompression;
import org.janelia.saalfeldlab.n5.blosc.BloscCompression;
import org.janelia.saalfeldlab.n5.cache.N5JsonCacheableContainer;
import org.janelia.saalfeldlab.n5.zarr.cache.ZarrJsonCache;

Expand Down Expand Up @@ -498,8 +503,7 @@ protected JsonElement zarrToN5DatasetAttributes(final JsonElement elem ) {

attrs.add(DatasetAttributes.DIMENSIONS_KEY, attrs.get(ZArrayAttributes.shapeKey));
attrs.add(DatasetAttributes.BLOCK_SIZE_KEY, attrs.get(ZArrayAttributes.chunksKey));
attrs.addProperty(DatasetAttributes.DATA_TYPE_KEY,
new DType(attrs.get(ZArrayAttributes.dTypeKey).getAsString()).getDataType().toString());
attrs.addProperty(DatasetAttributes.DATA_TYPE_KEY, zattrs.getDType().getDataType().toString());

JsonElement e = attrs.get(ZArrayAttributes.compressorKey);
if (e == JsonNull.INSTANCE) {
Expand Down Expand Up @@ -659,6 +663,11 @@ protected static DataBlock<?> readBlock(

final ByteArrayDataBlock byteBlock = dType.createByteBlock(blockSize, gridPosition);
final BlockReader reader = datasetAttributes.getCompression().getReader();

if (dType.getDataType() == DataType.STRING) {
return readVLenStringBlock(in, reader, byteBlock);
}

reader.read(byteBlock, in);

switch (dType.getDataType()) {
Expand All @@ -676,6 +685,26 @@ protected static DataBlock<?> readBlock(
return dataBlock;
}

/**
 * Reads a chunk of variable length strings from the given stream.
 * <p>
 * The whole chunk is read and then deserialized in one go; this should be
 * improved.
 *
 * @param in the stream to read the (possibly compressed) chunk from
 * @param reader the block reader of the dataset's compression
 * @param byteBlock block providing the size and grid position of the result
 * @return a new string data block with the size and grid position of
 *         {@code byteBlock}
 * @throws IOException if reading from the stream fails
 * @throws UnsupportedOperationException if {@code reader} is neither a
 *         {@link BloscCompression} nor a {@link DefaultBlockReader}
 */
private static ZarrStringDataBlock readVLenStringBlock(
		final InputStream in,
		final BlockReader reader,
		final ByteArrayDataBlock byteBlock) throws IOException {

	final ZarrStringDataBlock stringBlock =
			new ZarrStringDataBlock(byteBlock.getSize(), byteBlock.getGridPosition(), new String[0]);

	if (reader instanceof BloscCompression) {
		// Blosc reader reads actual data and doesn't care about buffer size
		// (but needs special treatment in data block)
		reader.read(stringBlock, in);
		return stringBlock;
	}

	if (!(reader instanceof DefaultBlockReader))
		throw new UnsupportedOperationException("Only Blosc compression or algorithms that use DefaultBlockReader are supported.");

	// Drain the reader's input stream completely, then let the block parse the bytes.
	try (final InputStream decompressed = ((DefaultBlockReader) reader).getInputStream(in)) {
		final ByteArrayOutputStream chunkBytes = new ByteArrayOutputStream();
		IOUtils.copy(new DataInputStream(decompressed), chunkBytes);
		stringBlock.readData(ByteBuffer.wrap(chunkBytes.toByteArray()));
	}
	return stringBlock;
}

/**
* Constructs the path for a data block in a dataset at a given grid
* position.
Expand Down Expand Up @@ -806,12 +835,12 @@ static Gson registerGson(final GsonBuilder gsonBuilder) {

protected static GsonBuilder addTypeAdapters(final GsonBuilder gsonBuilder) {

gsonBuilder.registerTypeAdapter(DType.class, new DType.JsonAdapter());
gsonBuilder.registerTypeAdapter(DataType.class, new DataType.JsonAdapter());
gsonBuilder.registerTypeAdapter(ZarrCompressor.class, ZarrCompressor.jsonAdapter);
gsonBuilder.registerTypeHierarchyAdapter(Compression.class, CompressionAdapter.getJsonAdapter());
gsonBuilder.registerTypeAdapter(Compression.class, CompressionAdapter.getJsonAdapter());
gsonBuilder.registerTypeAdapter(ZArrayAttributes.class, ZArrayAttributes.jsonAdapter);
gsonBuilder.registerTypeHierarchyAdapter(Filter.class, Filter.jsonAdapter);
gsonBuilder.disableHtmlEscaping();
gsonBuilder.serializeNulls();

Expand Down
Loading