From 1fb82fddb5abf331aff4fb6972a3aa3ddc358ed8 Mon Sep 17 00:00:00 2001 From: Luc Grosheintz Date: Mon, 4 Nov 2024 10:22:02 +0100 Subject: [PATCH] backport: Allow reading not-quite null-terminated strings. For fixed-length strings that have a padding mode that suggests they're null-terminated are in fact not required to be null-terminated, we (silently) fail to read the last character. Since, HDF5 will create such string, h5dump will display the file and h5py will read the string; we opted to allow it and also read the string into and `std::string` (which is variable length and already not guaranteed to have the same length as the fixed length string). HighFive will continue to not allow writing null-terminated strings that aren't via `write`. Backports: (#1056) --- include/highfive/H5DataType.hpp | 20 ++++++++-------- include/highfive/bits/H5Converter_misc.hpp | 16 +++++++------ tests/unit/tests_high_five_base.cpp | 27 ++++++++++++++++++++++ 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/include/highfive/H5DataType.hpp b/include/highfive/H5DataType.hpp index 68921c249..c06b7a733 100644 --- a/include/highfive/H5DataType.hpp +++ b/include/highfive/H5DataType.hpp @@ -153,16 +153,18 @@ class FixedLengthStringType: public StringType { /// UTF8. In particular, a string with `n` UFT8 characters in general /// requires `4*n` bytes. /// - /// The string padding is subtle, essentially it's just a hint. A - /// null-terminated string is guaranteed to have one `'\0'` which marks the - /// semantic end of the string. The length of the buffer must be at least - /// `size` bytes regardless. HDF5 will read or write `size` bytes, - /// irrespective of the when the `\0` occurs. - /// - /// Note that when writing passing `StringPadding::NullTerminated` is a + /// The string padding is subtle, essentially it's just a hint. While + /// commonly, a null-terminated string is guaranteed to have one `'\0'` + /// which marks the semantic end of the string, this is not enforced by + /// HDF5. In fact, there are HDF5 files that contain strings that claim to + /// be null-terminated but aren't. The length of the buffer must be at + /// least `size` bytes regardless of the padding. HDF5 will read or write + /// `size` bytes, irrespective of when (if at all) the `\0` occurs. + /// + /// Note that when writing, passing `StringPadding::NullTerminated` is a /// guarantee to the reader that it contains a `\0`. Therefore, make sure - /// that the string really is nullterminated. Otherwise prefer a - /// null-padded string which only means states that the buffer is filled up + /// that the string really is null-terminated. Otherwise prefer a + /// null-padded string. This mearly states that the buffer is filled up /// with 0 or more `\0`. FixedLengthStringType(size_t size, StringPadding padding, diff --git a/include/highfive/bits/H5Converter_misc.hpp b/include/highfive/bits/H5Converter_misc.hpp index ed387702f..df6b89afc 100644 --- a/include/highfive/bits/H5Converter_misc.hpp +++ b/include/highfive/bits/H5Converter_misc.hpp @@ -190,7 +190,7 @@ struct StringBuffer { } else if (buffer.isFixedLengthString()) { // If the buffer is fixed-length and null-terminated, then // `buffer.string_length` doesn't include the null-character. - if (length > buffer.string_length) { + if (length > buffer.string_max_length) { throw std::invalid_argument("String length too big."); } @@ -229,9 +229,9 @@ struct StringBuffer { /// `length() + 1` bytes long. size_t length() const { if (buffer.isNullTerminated()) { - return char_buffer_size(data(), buffer.string_length); + return char_buffer_size(data(), buffer.string_size); } else { - return buffer.string_length; + return buffer.string_max_length; } } @@ -272,7 +272,7 @@ struct StringBuffer { : file_datatype(_file_datatype.asStringType()) , padding(file_datatype.getPadding()) , string_size(file_datatype.isVariableStr() ? size_t(-1) : file_datatype.getSize()) - , string_length(string_size - size_t(isNullTerminated())) + , string_max_length(string_size - size_t(isNullTerminated())) , dims(_dims) { if (string_size == 0 && isNullTerminated()) { throw DataTypeException( @@ -322,9 +322,11 @@ struct StringBuffer { private: StringType file_datatype; StringPadding padding; - size_t string_size; // Size of buffer required to store the string. - // Meaningful for fixed length strings only. - size_t string_length; // Semantic length of string. + // Size of buffer required to store the string. + // Meaningful for fixed length strings only. + size_t string_size; + // Maximum length of string. + size_t string_max_length; std::vector dims; std::vector fixed_length_buffer; diff --git a/tests/unit/tests_high_five_base.cpp b/tests/unit/tests_high_five_base.cpp index f7cf67532..c9f94d9f0 100644 --- a/tests/unit/tests_high_five_base.cpp +++ b/tests/unit/tests_high_five_base.cpp @@ -2318,6 +2318,33 @@ void check_multiple_string(Proxy proxy, size_t string_length) { } } +template +void check_supposedly_nullterm(HighFive::File& file) { + auto dataspace = HighFive::DataSpace::Scalar(); + auto datatype = HighFive::FixedLengthStringType(5, HighFive::StringPadding::NullTerminated); + auto obj = CreateTraits::create(file, "not_null_terminated", dataspace, datatype); + + // Creates a 5 byte, "null-terminated", fixed-length string. The first five + // bytes are filled with "GROUP". Clearly, this isn't null-terminated. However, + // h5py will read it back as "GROUP", HDF5 allows us to create these; and they're + // found in the wild. + std::string value = "GROUP"; + obj.write_raw(value.c_str(), datatype); + + auto actual = obj.template read(); + REQUIRE(actual == value); +} + +TEST_CASE("HighFiveSTDString (attribute, nullterm cornercase)") { + auto file = HighFive::File("not_null_terminated_attribute.h5", HighFive::File::Truncate); + check_supposedly_nullterm(file); +} + +TEST_CASE("HighFiveSTDString (dataset, nullterm cornercase)") { + auto file = HighFive::File("not_null_terminated_dataset.h5", HighFive::File::Truncate); + check_supposedly_nullterm(file); +} + TEST_CASE("HighFiveSTDString (dataset, single, short)") { File file("std_string_dataset_single_short.h5", File::Truncate); check_single_string(ForwardToDataSet(file), 3);