Skip to content

Commit

Permalink
Merge pull request #8934 from rouault/parquet_metadata
Browse files Browse the repository at this point in the history
Parquet: support reading and writing layer metadata
  • Loading branch information
rouault authored Dec 8, 2023
2 parents ae830f5 + 9ffd8d3 commit 363fb81
Show file tree
Hide file tree
Showing 10 changed files with 180 additions and 11 deletions.
21 changes: 21 additions & 0 deletions autotest/ogr/ogr_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -3283,3 +3283,24 @@ def test_ogr_parquet_write_to_mem(tmp_vsimem, where):
"dict",
) and "nan" not in str(src_f.GetField(j)):
assert src_f.GetField(j) == f.GetField(j), field_name


###############################################################################


@gdaltest.enable_exceptions()
def test_ogr_parquet_metadata(tmp_vsimem):
    """Write layer metadata in the default, a json: and an xml: domain to a
    Parquet file, then reopen the file and check each domain reads back
    unchanged."""

    parquet_path = str(tmp_vsimem / "test_ogr_parquet_metadata.parquet")
    json_payload = '{"foo":["bar","baz"]}'
    xml_payload = "<foo/>"

    # Write phase: attach the metadata to a geometry-less layer.
    out_ds = ogr.GetDriverByName("Parquet").CreateDataSource(parquet_path)
    out_lyr = out_ds.CreateLayer("test", geom_type=ogr.wkbNone)
    out_lyr.SetMetadataItem("foo", "bar")
    out_lyr.SetMetadata([json_payload], "json:test")
    out_lyr.SetMetadata([xml_payload], "xml:test")
    # Dropping the dataset reference closes it before the file is reopened.
    out_ds = None

    # Read phase: every domain must come back as it was written.
    in_ds = ogr.Open(parquet_path)
    in_lyr = in_ds.GetLayer(0)
    assert in_lyr.GetMetadata_Dict() == {"foo": "bar"}
    assert in_lyr.GetMetadata_List("json:test")[0] == json_payload
    assert in_lyr.GetMetadata_List("xml:test")[0] == xml_payload
10 changes: 9 additions & 1 deletion doc/source/drivers/vector/parquet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ This driver also supports geometry columns using the GeoParquet specification.

.. note:: The driver should be considered experimental as the GeoParquet specification is not finalized yet.

The GeoParquet 1.0.0-beta1 specification is supported since GDAL 3.6.2
The GeoParquet 1.0.0 specification is supported since GDAL 3.8.0

Driver capabilities
-------------------
Expand Down Expand Up @@ -131,6 +131,14 @@ if the driver is built against the ``arrowdataset`` C++ library.

Note that no optimization is currently done regarding filtering.

Metadata
--------

.. versionadded:: 3.9.0

Layer metadata can be read and written. It is serialized as JSON content stored
under the ``gdal:metadata`` key of the file key/value metadata.

Multithreading
--------------

Expand Down
2 changes: 1 addition & 1 deletion ogr/ogrsf_frmts/arrow/ogrfeatherlayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ void OGRFeatherLayer::EstablishFeatureDefn()
LoadGeoMetadata(kv_metadata.get(), "geo");
}
const auto oMapFieldNameToGDALSchemaFieldDefn =
LoadGDALMetadata(kv_metadata.get());
LoadGDALSchema(kv_metadata.get());

const auto fields = m_poSchema->fields();
for (int i = 0; i < m_poSchema->num_fields(); ++i)
Expand Down
4 changes: 3 additions & 1 deletion ogr/ogrsf_frmts/arrow_common/ogr_arrow.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,9 @@ class OGRArrowLayer CPL_NON_FINAL
std::vector<Constraint> m_asAttributeFilterConstraints{};

std::map<std::string, std::unique_ptr<OGRFieldDefn>>
LoadGDALMetadata(const arrow::KeyValueMetadata *kv_metadata);
LoadGDALSchema(const arrow::KeyValueMetadata *kv_metadata);

void LoadGDALMetadata(const arrow::KeyValueMetadata *kv_metadata);

OGRArrowLayer(OGRArrowDataset *poDS, const char *pszLayerName);

Expand Down
60 changes: 58 additions & 2 deletions ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,11 @@ inline OGRArrowLayer::~OGRArrowLayer()
}

/************************************************************************/
/* LoadGDALMetadata() */
/* LoadGDALSchema() */
/************************************************************************/

inline std::map<std::string, std::unique_ptr<OGRFieldDefn>>
OGRArrowLayer::LoadGDALMetadata(const arrow::KeyValueMetadata *kv_metadata)
OGRArrowLayer::LoadGDALSchema(const arrow::KeyValueMetadata *kv_metadata)
{
std::map<std::string, std::unique_ptr<OGRFieldDefn>>
oMapFieldNameToGDALSchemaFieldDefn;
Expand Down Expand Up @@ -164,6 +164,62 @@ OGRArrowLayer::LoadGDALMetadata(const arrow::KeyValueMetadata *kv_metadata)
return oMapFieldNameToGDALSchemaFieldDefn;
}

/************************************************************************/
/* LoadGDALMetadata() */
/************************************************************************/

/**
 * Restore layer metadata from the "gdal:metadata" key/value entry.
 *
 * The entry is loaded as a JSON document whose top-level members are treated
 * as metadata domains:
 *  - a member whose name starts with "json:" and whose value is a JSON object
 *    is re-serialized and stored as the single string of that domain;
 *  - a member whose name starts with "xml:" and whose value is a string is
 *    stored as the single string of that domain;
 *  - for any other member, each string-valued child is set as a metadata
 *    item in the domain named after the member. Non-string children are
 *    ignored.
 *
 * Nothing is done when kv_metadata is null, when it has no "gdal:metadata"
 * key, or when loading the JSON document fails.
 *
 * @param kv_metadata Key/value metadata to read from. May be null.
 */
inline void
OGRArrowLayer::LoadGDALMetadata(const arrow::KeyValueMetadata *kv_metadata)
{
    if (!kv_metadata || !kv_metadata->Contains("gdal:metadata"))
        return;

    const auto gdalMetadata = kv_metadata->Get("gdal:metadata");
    if (!gdalMetadata.ok())
        return;

    CPLJSONDocument oDoc;
    if (!oDoc.LoadMemory(*gdalMetadata))
        return;

    // Store osValue as the sole string of the given metadata domain.
    const auto SetSingleStringDomain =
        [this](const std::string &osDomainName, const std::string &osValue)
    {
        char **papszMD = CSLAddString(nullptr, osValue.c_str());
        SetMetadata(papszMD, osDomainName.c_str());
        CSLDestroy(papszMD);
    };

    const auto oRoot = oDoc.GetRoot();
    // Iterate by const reference: there is no need to copy each JSON child.
    for (const auto &oDomain : oRoot.GetChildren())
    {
        const std::string osDomainName = oDomain.GetName();
        const auto eDomainType = oDomain.GetType();

        if (STARTS_WITH(osDomainName.c_str(), "json:") &&
            eDomainType == CPLJSONObject::Type::Object)
        {
            SetSingleStringDomain(
                osDomainName,
                oDomain.Format(CPLJSONObject::PrettyFormat::Plain));
        }
        else if (STARTS_WITH(osDomainName.c_str(), "xml:") &&
                 eDomainType == CPLJSONObject::Type::String)
        {
            SetSingleStringDomain(osDomainName, oDomain.ToString());
        }
        else
        {
            // Generic domain, or a json:/xml: member whose value type does
            // not match the expectations above: keep string children only.
            for (const auto &oItem : oDomain.GetChildren())
            {
                if (oItem.GetType() == CPLJSONObject::Type::String)
                {
                    SetMetadataItem(oItem.GetName().c_str(),
                                    oItem.ToString().c_str(),
                                    osDomainName.c_str());
                }
            }
        }
    }
}

/************************************************************************/
/* IsIntegerArrowType() */
/************************************************************************/
Expand Down
12 changes: 11 additions & 1 deletion ogr/ogrsf_frmts/parquet/ogr_parquet.h
Original file line number Diff line number Diff line change
Expand Up @@ -239,11 +239,14 @@ class OGRParquetDataset final : public OGRArrowDataset
/* OGRParquetWriterLayer */
/************************************************************************/

class OGRParquetWriterDataset;

class OGRParquetWriterLayer final : public OGRArrowWriterLayer
{
OGRParquetWriterLayer(const OGRParquetWriterLayer &) = delete;
OGRParquetWriterLayer &operator=(const OGRParquetWriterLayer &) = delete;

OGRParquetWriterDataset *m_poDataset = nullptr;
std::unique_ptr<parquet::arrow::FileWriter> m_poFileWriter{};
std::shared_ptr<const arrow::KeyValueMetadata> m_poKeyValueMetadata{};
bool m_bForceCounterClockwiseOrientation = false;
Expand Down Expand Up @@ -282,12 +285,14 @@ class OGRParquetWriterLayer final : public OGRArrowWriterLayer

public:
OGRParquetWriterLayer(
arrow::MemoryPool *poMemoryPool,
OGRParquetWriterDataset *poDS, arrow::MemoryPool *poMemoryPool,
const std::shared_ptr<arrow::io::OutputStream> &poOutputStream,
const char *pszLayerName);

~OGRParquetWriterLayer() override;

CPLErr SetMetadata(char **papszMetadata, const char *pszDomain) override;

bool SetOptions(CSLConstList papszOptions,
const OGRSpatialReference *poSpatialRef,
OGRwkbGeometryType eGType);
Expand Down Expand Up @@ -356,6 +361,11 @@ class OGRParquetWriterDataset final : public GDALPamDataset
bool AddFieldDomain(std::unique_ptr<OGRFieldDomain> &&domain,
std::string &failureReason) override;

/** Return a mutable reference to this dataset's oMDMD metadata container. */
GDALMultiDomainMetadata &GetMultiDomainMetadata() { return oMDMD; }

protected:
OGRLayer *ICreateLayer(const char *pszName,
const OGRSpatialReference *poSpatialRef = nullptr,
Expand Down
4 changes: 3 additions & 1 deletion ogr/ogrsf_frmts/parquet/ogrparquetdatasetlayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ void OGRParquetDatasetLayer::EstablishFeatureDefn()

LoadGeoMetadata(kv_metadata);
const auto oMapFieldNameToGDALSchemaFieldDefn =
LoadGDALMetadata(kv_metadata.get());
LoadGDALSchema(kv_metadata.get());

LoadGDALMetadata(kv_metadata.get());

const auto fields = m_poSchema->fields();
for (int i = 0; i < m_poSchema->num_fields(); ++i)
Expand Down
4 changes: 3 additions & 1 deletion ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,9 @@ void OGRParquetLayer::EstablishFeatureDefn()

LoadGeoMetadata(kv_metadata);
const auto oMapFieldNameToGDALSchemaFieldDefn =
LoadGDALMetadata(kv_metadata.get());
LoadGDALSchema(kv_metadata.get());

LoadGDALMetadata(kv_metadata.get());

if (!m_poArrowReader->GetSchema(&m_poSchema).ok())
{
Expand Down
2 changes: 1 addition & 1 deletion ogr/ogrsf_frmts/parquet/ogrparquetwriterdataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ OGRLayer *OGRParquetWriterDataset::ICreateLayer(
return nullptr;
}
m_poLayer = std::make_unique<OGRParquetWriterLayer>(
m_poMemoryPool.get(), m_poOutputStream, pszName);
this, m_poMemoryPool.get(), m_poOutputStream, pszName);
if (!m_poLayer->SetOptions(papszOptions, poSpatialRef, eGType))
{
m_poLayer.reset();
Expand Down
72 changes: 70 additions & 2 deletions ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,11 @@
/************************************************************************/

OGRParquetWriterLayer::OGRParquetWriterLayer(
arrow::MemoryPool *poMemoryPool,
OGRParquetWriterDataset *poDataset, arrow::MemoryPool *poMemoryPool,
const std::shared_ptr<arrow::io::OutputStream> &poOutputStream,
const char *pszLayerName)
: OGRArrowWriterLayer(poMemoryPool, poOutputStream, pszLayerName)
: OGRArrowWriterLayer(poMemoryPool, poOutputStream, pszLayerName),
m_poDataset(poDataset)
{
m_bWriteFieldArrowExtensionName = CPLTestBool(
CPLGetConfigOption("OGR_PARQUET_WRITE_ARROW_EXTENSION_NAME", "NO"));
Expand Down Expand Up @@ -567,6 +568,59 @@ void OGRParquetWriterLayer::PerformStepsBeforeFinalFlushGroup()
->Append(kArrowSchemaKey, schema_base64);
}
}

// Put GDAL metadata into a gdal:metadata domain
CPLJSONObject oMultiMetadata;
bool bHasMultiMetadata = false;
auto &l_oMDMD = oMDMD.GetDomainList() && *(oMDMD.GetDomainList())
? oMDMD
: m_poDataset->GetMultiDomainMetadata();
for (CSLConstList papszDomainIter = l_oMDMD.GetDomainList();
papszDomainIter && *papszDomainIter; ++papszDomainIter)
{
const char *pszDomain = *papszDomainIter;
CSLConstList papszMD = l_oMDMD.GetMetadata(pszDomain);
if (STARTS_WITH(pszDomain, "json:") && papszMD && papszMD[0])
{
CPLJSONDocument oDoc;
if (oDoc.LoadMemory(papszMD[0]))
{
bHasMultiMetadata = true;
oMultiMetadata.Add(pszDomain, oDoc.GetRoot());
continue;
}
}
else if (STARTS_WITH(pszDomain, "xml:") && papszMD && papszMD[0])
{
bHasMultiMetadata = true;
oMultiMetadata.Add(pszDomain, papszMD[0]);
continue;
}
CPLJSONObject oMetadata;
bool bHasMetadata = false;
for (CSLConstList papszMDIter = papszMD;
papszMDIter && *papszMDIter; ++papszMDIter)
{
char *pszKey = nullptr;
const char *pszValue = CPLParseNameValue(*papszMDIter, &pszKey);
if (pszKey && pszValue)
{
bHasMetadata = true;
bHasMultiMetadata = true;
oMetadata.Add(pszKey, pszValue);
}
CPLFree(pszKey);
}
if (bHasMetadata)
oMultiMetadata.Add(pszDomain, oMetadata);
}
if (bHasMultiMetadata)
{
const_cast<arrow::KeyValueMetadata *>(m_poKeyValueMetadata.get())
->Append(
"gdal:metadata",
oMultiMetadata.Format(CPLJSONObject::PrettyFormat::Plain));
}
}
}

Expand Down Expand Up @@ -818,3 +872,17 @@ bool OGRParquetWriterLayer::IsArrowSchemaSupported(
return true;
}
#endif

/************************************************************************/
/* SetMetadata() */
/************************************************************************/

CPLErr OGRParquetWriterLayer::SetMetadata(char **papszMetadata,
                                          const char *pszDomain)
{
    // Metadata addressed to the "SHAPEFILE" domain is dropped: the call
    // reports success without storing anything. A null domain and every
    // other domain are forwarded to the base implementation.
    const bool bIsShapefileDomain =
        pszDomain != nullptr && EQUAL(pszDomain, "SHAPEFILE");
    if (bIsShapefileDomain)
        return CE_None;

    return OGRLayer::SetMetadata(papszMetadata, pszDomain);
}

0 comments on commit 363fb81

Please sign in to comment.