diff --git a/autotest/cpp/test_ogr.cpp b/autotest/cpp/test_ogr.cpp index 6fe859df02f8..4c5435104ed3 100644 --- a/autotest/cpp/test_ogr.cpp +++ b/autotest/cpp/test_ogr.cpp @@ -3761,4 +3761,200 @@ TEST_F(test_ogr, wkb_polygon_xyzm_discard_lsb_bits) delete poGeom; } +// Test OGRFeature::SerializeToBinary() and DeserializeFromBinary(); +TEST_F(test_ogr, OGRFeature_SerializeToBinary) +{ + { + OGRFeatureDefn oFDefn; + oFDefn.SetGeomType(wkbNone); + oFDefn.Reference(); + + { + OGRFeature oFeatSrc(&oFDefn); + oFeatSrc.SetFID(1); + std::vector abyBuffer; + + EXPECT_TRUE(oFeatSrc.SerializeToBinary(abyBuffer)); + EXPECT_EQ(abyBuffer.size(), 1); + EXPECT_EQ(abyBuffer[0], 1); + + OGRFeature oFeatDst(&oFDefn); + EXPECT_FALSE(oFeatDst.DeserializeFromBinary(abyBuffer.data(), 0)); + EXPECT_TRUE(oFeatDst.DeserializeFromBinary(abyBuffer.data(), + abyBuffer.size())); + EXPECT_EQ(oFeatDst.GetFID(), 1); + } + + { + OGRFeature oFeatSrc(&oFDefn); + oFeatSrc.SetFID(static_cast(-12345678901234)); + std::vector abyBuffer; + + EXPECT_TRUE(oFeatSrc.SerializeToBinary(abyBuffer)); + + OGRFeature oFeatDst(&oFDefn); + // Try truncated buffers + for (size_t i = 0; i < abyBuffer.size(); ++i) + { + EXPECT_FALSE( + oFeatDst.DeserializeFromBinary(abyBuffer.data(), i)); + } + EXPECT_TRUE(oFeatDst.DeserializeFromBinary(abyBuffer.data(), + abyBuffer.size())); + EXPECT_EQ(oFeatDst.GetFID(), static_cast(-12345678901234)); + } + } + + { + OGRFeatureDefn oFDefn; + oFDefn.Reference(); + { + OGRFieldDefn oFieldDefn("int", OFTInteger); + oFDefn.AddFieldDefn(&oFieldDefn); + } + { + OGRFieldDefn oFieldDefn("int64", OFTInteger64); + oFDefn.AddFieldDefn(&oFieldDefn); + } + { + OGRFieldDefn oFieldDefn("real", OFTReal); + oFDefn.AddFieldDefn(&oFieldDefn); + } + { + OGRFieldDefn oFieldDefn("str", OFTString); + oFDefn.AddFieldDefn(&oFieldDefn); + } + { + OGRFieldDefn oFieldDefn("binary", OFTBinary); + oFDefn.AddFieldDefn(&oFieldDefn); + } + { + OGRFieldDefn oFieldDefn("intlist", OFTIntegerList); + oFDefn.AddFieldDefn(&oFieldDefn); + } + { + OGRFieldDefn oFieldDefn("int64list", OFTInteger64List); + oFDefn.AddFieldDefn(&oFieldDefn); + } + { + OGRFieldDefn oFieldDefn("reallist", OFTRealList); + oFDefn.AddFieldDefn(&oFieldDefn); + } + { + OGRFieldDefn oFieldDefn("strlist", OFTStringList); + oFDefn.AddFieldDefn(&oFieldDefn); + } + { + OGRFieldDefn oFieldDefn("date", OFTDate); + oFDefn.AddFieldDefn(&oFieldDefn); + } + { + OGRFieldDefn oFieldDefn("time", OFTTime); + oFDefn.AddFieldDefn(&oFieldDefn); + } + { + OGRFieldDefn oFieldDefn("datetime", OFTDateTime); + oFDefn.AddFieldDefn(&oFieldDefn); + } + + { + OGRFeature oFeatSrc(&oFDefn); + std::vector abyBuffer; + + EXPECT_TRUE(oFeatSrc.SerializeToBinary(abyBuffer)); + EXPECT_EQ(abyBuffer.size(), 5); + + OGRFeature oFeatDst(&oFDefn); + for (size_t i = 0; i < abyBuffer.size(); ++i) + { + EXPECT_FALSE( + oFeatDst.DeserializeFromBinary(abyBuffer.data(), i)); + } + EXPECT_TRUE(oFeatDst.DeserializeFromBinary(abyBuffer.data(), + abyBuffer.size())); + EXPECT_TRUE(oFeatDst.Equal(&oFeatSrc)); + } + + { + OGRFeature oFeatSrc(&oFDefn); + std::vector abyBuffer; + + oFeatSrc.SetFieldNull(oFDefn.GetFieldIndex("int")); + EXPECT_TRUE(oFeatSrc.SerializeToBinary(abyBuffer)); + EXPECT_EQ(abyBuffer.size(), 5); + + OGRFeature oFeatDst(&oFDefn); + + // Try truncated buffers + for (size_t i = 0; i < abyBuffer.size(); ++i) + { + EXPECT_FALSE( + oFeatDst.DeserializeFromBinary(abyBuffer.data(), i)); + } + + EXPECT_TRUE(oFeatDst.DeserializeFromBinary(abyBuffer.data(), + abyBuffer.size())); + EXPECT_TRUE(oFeatDst.Equal(&oFeatSrc)); + } + + { + OGRFeature oFeatSrc(&oFDefn); + oFeatSrc.SetFID(1); + oFeatSrc.SetField("int", -123); + oFeatSrc.SetField("int64", static_cast(-12345678901234)); + oFeatSrc.SetField("real", 1.25); + oFeatSrc.SetField("str", "foo"); + oFeatSrc.SetField(oFDefn.GetFieldIndex("binary"), 3, + static_cast("abc")); + oFeatSrc.SetField("intlist", 2, + std::vector{1, -123456}.data()); + oFeatSrc.SetField("int64list", 2, + std::vector{1, -12345678901234}.data()); + oFeatSrc.SetField("reallist", 2, + std::vector{1.5, -2.5}.data()); + CPLStringList aosList; + aosList.AddString("foo"); + aosList.AddString("barbaz"); + oFeatSrc.SetField("strlist", aosList.List()); + oFeatSrc.SetField("date", 2023, 1, 3); + oFeatSrc.SetField("time", 0, 0, 0, 12, 34, 56.789f); + oFeatSrc.SetField("datetime", 2023, 1, 3, 12, 34, 56.789f); + OGRPoint p(1, 2); + oFeatSrc.SetGeometry(&p); + std::vector abyBuffer; + + EXPECT_TRUE(oFeatSrc.SerializeToBinary(abyBuffer)); + + OGRFeature oFeatDst(&oFDefn); + + // Try truncated buffers + for (size_t i = 0; i < abyBuffer.size(); ++i) + { + EXPECT_FALSE( + oFeatDst.DeserializeFromBinary(abyBuffer.data(), i)); + } + + // Try corrupted buffers + { + CPLErrorHandlerPusher oErrorHandler(CPLQuietErrorHandler); + for (size_t i = 0; i < abyBuffer.size(); ++i) + { + // Might succeed or fail, but shouldn't crash.. + const GByte backup = abyBuffer[i]; + abyBuffer[i] = static_cast(~abyBuffer[i]); + (void)oFeatDst.DeserializeFromBinary(abyBuffer.data(), + abyBuffer.size()); + abyBuffer[i] = backup; + } + } + + EXPECT_TRUE(oFeatDst.DeserializeFromBinary(abyBuffer.data(), + abyBuffer.size())); + // oFeatSrc.DumpReadable(stdout); + // oFeatDst.DumpReadable(stdout); + EXPECT_TRUE(oFeatDst.Equal(&oFeatSrc)); + } + } +} + } // namespace diff --git a/autotest/ogr/data/parquet/overture_map_extract.parquet b/autotest/ogr/data/parquet/overture_map_extract.parquet new file mode 100644 index 000000000000..40b63253f4f2 Binary files /dev/null and b/autotest/ogr/data/parquet/overture_map_extract.parquet differ diff --git a/autotest/ogr/ogr_parquet.py b/autotest/ogr/ogr_parquet.py index 424ff7ce6abf..f6b3d909c585 100755 --- a/autotest/ogr/ogr_parquet.py +++ b/autotest/ogr/ogr_parquet.py @@ -615,6 +615,14 @@ def test_ogr_parquet_write_from_another_dataset(use_vsi, row_group_size, fid): assert "geometry" in j["columns"] assert "encoding" in j["columns"]["geometry"] assert j["columns"]["geometry"]["encoding"] == "WKB" + assert j["columns"]["geometry"]["covering"] == { + "bbox": { + "xmax": ["geometry_bbox", "xmax"], + "xmin": ["geometry_bbox", "xmin"], + "ymax": ["geometry_bbox", "ymax"], + "ymin": ["geometry_bbox", "ymin"], + } + } md = lyr.GetMetadata("_PARQUET_METADATA_") assert "geo" in md @@ -2971,22 +2979,16 @@ def test_ogr_parquet_nested_types(): ############################################################################### -# Test GetExtent() using bbox.minx, bbox.miny, bbox.maxx, bbox.maxy fields -# as in Ouverture Maps datasets +# Test float32 bounding box column -def test_ogr_parquet_bbox_minx_miny_maxx_maxy(tmp_vsimem): +def test_ogr_parquet_bbox_float32(tmp_vsimem): - outfilename = str(tmp_vsimem / "test_ogr_parquet_bbox_minx_miny_maxx_maxy.parquet") + outfilename = str(tmp_vsimem / "test_ogr_parquet_bbox_float32.parquet") ds = ogr.GetDriverByName("Parquet").CreateDataSource(outfilename) lyr = ds.CreateLayer( - "test", geom_type=ogr.wkbNone, options=["FID=fid", "ROW_GROUP_SIZE=2"] + "test", geom_type=ogr.wkbPolygon, options=["FID=fid", "ROW_GROUP_SIZE=2"] ) - lyr.CreateField(ogr.FieldDefn("bbox.minx", ogr.OFTReal)) - lyr.CreateField(ogr.FieldDefn("bbox.miny", ogr.OFTReal)) - lyr.CreateField(ogr.FieldDefn("bbox.maxx", ogr.OFTReal)) - lyr.CreateField(ogr.FieldDefn("bbox.maxy", ogr.OFTReal)) - lyr.CreateField(ogr.FieldDefn("geometry", ogr.OFTBinary)) fid = 0 for wkt in ["LINESTRING(1 2,3 4)", None, "LINESTRING(-1 0,1 10)"]: f = ogr.Feature(lyr.GetLayerDefn()) @@ -2994,16 +2996,42 @@ def test_ogr_parquet_bbox_minx_miny_maxx_maxy(tmp_vsimem): fid += 1 if wkt: g = ogr.CreateGeometryFromWkt(wkt) - minx, maxx, miny, maxy = g.GetEnvelope() - f["bbox.minx"] = minx - f["bbox.miny"] = miny - f["bbox.maxx"] = maxx - f["bbox.maxy"] = maxy - wkb = g.ExportToIsoWkb() - f.SetFieldBinaryFromHexString("geometry", "".join("%02X" % x for x in wkb)) + f.SetGeometryDirectly(g) lyr.CreateFeature(f) ds = None + def check_file(filename): + with gdaltest.config_option("OGR_PARQUET_USE_BBOX", "NO"): + ds = ogr.Open(filename) + lyr = ds.GetLayer(0) + assert lyr.TestCapability(ogr.OLCFastGetExtent) == 0 + minx, maxx, miny, maxy = lyr.GetExtent() + assert (minx, miny, maxx, maxy) == (-1.0, 0.0, 3.0, 10.0) + f = lyr.GetNextFeature() + assert f["geometry_bbox.xmin"] == 1 + assert f["geometry_bbox.ymin"] == 2 + assert f["geometry_bbox.xmax"] == 3 + assert f["geometry_bbox.ymax"] == 4 + f = lyr.GetNextFeature() + assert f["geometry_bbox.xmin"] is None + assert f["geometry_bbox.ymin"] is None + assert f["geometry_bbox.xmax"] is None + assert f["geometry_bbox.ymax"] is None + f = lyr.GetNextFeature() + assert f["geometry_bbox.xmin"] == -1 + assert f["geometry_bbox.ymin"] == 0 + assert f["geometry_bbox.xmax"] == 1 + assert f["geometry_bbox.ymax"] == 10 + ds = None + + check_file(outfilename) + + # Check that re-creating the bounding box column works with the Arrow + # interface + outfilename2 = str(tmp_vsimem / "test_ogr_parquet_bbox_float32_copy.parquet") + gdal.VectorTranslate(outfilename2, outfilename) + check_file(outfilename2) + ds = ogr.Open(outfilename) lyr = ds.GetLayer(0) assert lyr.GetGeometryColumn() == "geometry" @@ -3016,14 +3044,6 @@ def test_ogr_parquet_bbox_minx_miny_maxx_maxy(tmp_vsimem): assert f.GetFID() == 0 assert lyr.GetNextFeature() is None - for field_name in ("bbox.minx", "bbox.miny", "bbox.maxx", "bbox.maxy"): - lyr.SetIgnoredFields([field_name]) - with ogrtest.spatial_filter(lyr, 1, 2, 1, 2): - f = lyr.GetNextFeature() - assert f.GetFID() == 0 - assert lyr.GetNextFeature() is None - lyr.SetIgnoredFields([]) - # Test dfGroupMaxY < m_sFilterEnvelope.MinY with ogrtest.spatial_filter(lyr, 1, 10, 1, 10): f = lyr.GetNextFeature() @@ -3054,16 +3074,47 @@ def test_ogr_parquet_bbox_minx_miny_maxx_maxy(tmp_vsimem): f = lyr.GetNextFeature() assert f.GetFID() == 0 assert lyr.GetNextFeature() is None + ds = None + + +############################################################################### +# Test GetExtent() using bbox.minx, bbox.miny, bbox.maxx, bbox.maxy fields +# as in Ouverture Maps datasets + + +def test_ogr_parquet_bbox_double(): + + ds = ogr.Open("data/parquet/overture_map_extract.parquet") + lyr = ds.GetLayer(0) + assert lyr.GetGeometryColumn() == "geometry" + assert lyr.TestCapability(ogr.OLCFastGetExtent) == 1 + minx, maxx, miny, maxy = lyr.GetExtent() + assert (minx, miny, maxx, maxy) == pytest.approx( + (-36.831345, -10.049401, -36.831238, -10.049268) + ) + + with ogrtest.spatial_filter( + lyr, + minx + (maxx - minx) / 2, + miny + (maxy - miny) / 2, + maxx - (maxx - minx) / 2, + maxy - (maxy - miny) / 2, + ): + f = lyr.GetNextFeature() + assert f.GetFID() == 0 + assert lyr.GetNextFeature() is None ds = None with gdaltest.config_option("OGR_PARQUET_USE_BBOX", "NO"): - ds = ogr.Open(outfilename) + ds = ogr.Open("data/parquet/overture_map_extract.parquet") lyr = ds.GetLayer(0) assert lyr.GetGeometryColumn() == "geometry" assert lyr.TestCapability(ogr.OLCFastGetExtent) == 0 minx, maxx, miny, maxy = lyr.GetExtent() - assert (minx, miny, maxx, maxy) == (-1.0, 0.0, 3.0, 10.0) + assert (minx, miny, maxx, maxy) == pytest.approx( + (-36.831345, -10.049401, -36.831238, -10.049268) + ) ds = None @@ -3332,3 +3383,103 @@ def test_ogr_parquet_get_extent_3d(tmp_vsimem): lyr = ds.GetLayer(0) assert lyr.TestCapability(ogr.OLCFastGetExtent3D) == 0 assert lyr.GetExtent3D() == (1.0, 4.0, 2.0, 5.0, 3.0, 6.0) + + +############################################################################### + + +@gdaltest.enable_exceptions() +@pytest.mark.require_driver("GPKG") +def test_ogr_parquet_sort_by_bbox(tmp_vsimem): + + outfilename = str(tmp_vsimem / "test_ogr_parquet_sort_by_bbox.parquet") + ds = ogr.GetDriverByName("Parquet").CreateDataSource(outfilename) + + gpkg_drv = gdal.GetDriverByName("GPKG") + gpkg_drv.Deregister() + ROW_GROUP_SIZE = 100 + try: + with pytest.raises( + Exception, + match="Driver GPKG required for SORT_BY_BBOX layer creation option", + ): + ds.CreateLayer( + "test", + geom_type=ogr.wkbPoint, + options=["SORT_BY_BBOX=YES", f"ROW_GROUP_SIZE={ROW_GROUP_SIZE}"], + ) + finally: + gpkg_drv.Register() + + lyr = ds.CreateLayer( + "test", + geom_type=ogr.wkbPoint, + options=["SORT_BY_BBOX=YES", f"ROW_GROUP_SIZE={ROW_GROUP_SIZE}", "FID=fid"], + ) + assert lyr.TestCapability(ogr.OLCFastWriteArrowBatch) == 0 + lyr.CreateField(ogr.FieldDefn("i", ogr.OFTInteger)) + COUNT_NON_SPATIAL = 501 + COUNT_SPATIAL = 601 + for i in range(COUNT_NON_SPATIAL): + f = ogr.Feature(lyr.GetLayerDefn()) + f["i"] = i + lyr.CreateFeature(f) + for i in range(COUNT_SPATIAL): + f = ogr.Feature(lyr.GetLayerDefn()) + f["i"] = i + COUNT_NON_SPATIAL + f.SetGeometryDirectly(ogr.CreateGeometryFromWkt(f"POINT({i} {i})")) + lyr.CreateFeature(f) + ds = None + + with gdaltest.config_option("OGR_PARQUET_SHOW_ROW_GROUP_EXTENT", "YES"): + ds = ogr.Open(outfilename) + lyr = ds.GetLayer(0) + theorical_number_of_groups = ( + COUNT_SPATIAL + ROW_GROUP_SIZE - 1 + ) // ROW_GROUP_SIZE + assert lyr.GetFeatureCount() - theorical_number_of_groups <= max( + 1, 0.3 * theorical_number_of_groups + ) + assert sum([f["feature_count"] for f in lyr]) == COUNT_SPATIAL + + def check_file(filename): + ds = ogr.Open(filename) + lyr = ds.GetLayer(0) + + # First features should be non spatial ones + for i in range(COUNT_NON_SPATIAL): + f = lyr.GetNextFeature() + assert f.GetFID() == i + assert f["i"] == i + assert f.GetGeometryRef() is None + + # Now spatial features + count = 0 + foundNonSequential = False + set_i = set() + while True: + f = lyr.GetNextFeature() + if not f: + break + assert f["i"] >= COUNT_NON_SPATIAL + if f["i"] != i + COUNT_NON_SPATIAL: + foundNonSequential = True + assert f["i"] not in set_i + set_i.add(f["i"]) + assert f.GetFID() == f["i"] + assert f.GetGeometryRef().GetX() == f["i"] - COUNT_NON_SPATIAL + count += 1 + + assert count == COUNT_SPATIAL + assert foundNonSequential + + check_file(outfilename) + + # Check that this works also when using the Arrow interface for creation + outfilename2 = str(tmp_vsimem / "test_ogr_parquet_sort_by_bbox2.parquet") + gdal.VectorTranslate( + outfilename2, + outfilename, + layerCreationOptions=["SORT_BY_BBOX=YES", "ROW_GROUP_SIZE=100"], + ) + check_file(outfilename2) diff --git a/doc/source/drivers/vector/parquet.rst b/doc/source/drivers/vector/parquet.rst index 2f73d011306e..687d83f1ddf4 100644 --- a/doc/source/drivers/vector/parquet.rst +++ b/doc/source/drivers/vector/parquet.rst @@ -115,6 +115,40 @@ Layer creation options Name of creating application. +- .. lco:: WRITE_COVERING_BBOX + :choices: YES, NO + :default: YES + :since: 3.9 + + Whether to write xmin/ymin/xmax/ymax columns with the bounding box of + geometries. + +- .. lco:: SORT_BY_BBOX + :choices: YES, NO + :default: NO + :since: 3.9 + + Whether features should be sorted based on the bounding box of their + geometries, before being written in the final file. Sorting them enables + faster spatial filtering on reading, by grouping together spatially close + features in the same group of rows. + + Note however that enabling this option involves creating a temporary + GeoPackage file (in the same directory as the final Parquet file), + and thus requires temporary storage (possibly up to several times the size + of the final Parquet file, depending on Parquet compression) and additional + processing time. + + The efficiency of spatial filtering depends on the ROW_GROUP_SIZE. If it + is too large, too many features that are not spatially close will be grouped + together. If it is too small, the file size will increase, and extra + processing time will be necessary to browse through the row groups. + + Note also that when this option is enabled, the Arrow writing API (which + is for example triggered when using ogr2ogr to convert from Parquet to Parquet), + fallbacks to the generic implementation, which does not support advanced + Arrow types (lists, maps, etc.). + SQL support ----------- diff --git a/ogr/ogr_feature.h b/ogr/ogr_feature.h index e86e1e6c3b49..40d0c74bfa48 100644 --- a/ogr/ogr_feature.h +++ b/ogr/ogr_feature.h @@ -1412,6 +1412,9 @@ class CPL_DLL OGRFeature int Validate(int nValidateFlags, int bEmitError) const; void FillUnsetWithDefault(int bNotNullableOnly, char **papszOptions); + bool SerializeToBinary(std::vector &abyBuffer) const; + bool DeserializeFromBinary(const GByte *pabyBuffer, size_t nSize); + virtual const char *GetStyleString() const; virtual void SetStyleString(const char *); virtual void SetStyleStringDirectly(char *); diff --git a/ogr/ogrfeature.cpp b/ogr/ogrfeature.cpp index 66cdd49ef13f..4274900f6431 100644 --- a/ogr/ogrfeature.cpp +++ b/ogr/ogrfeature.cpp @@ -7475,6 +7475,740 @@ void OGRFeatureUniquePtrDeleter::operator()(OGRFeature *poFeature) const } //! @endcond +namespace +{ +// Implementation borrowed to OpenFileGDB + +/************************************************************************/ +/* WriteUInt8() */ +/************************************************************************/ + +inline void WriteUInt8(std::vector &abyBuffer, uint8_t nVal) +{ + abyBuffer.push_back(nVal); +} + +/************************************************************************/ +/* WriteVarUInt() */ +/************************************************************************/ + +inline void WriteVarUInt(std::vector &abyBuffer, uint64_t nVal) +{ + while (true) + { + if (nVal >= 0x80) + { + WriteUInt8(abyBuffer, static_cast(0x80 | (nVal & 0x7F))); + nVal >>= 7; + } + else + { + WriteUInt8(abyBuffer, static_cast(nVal)); + break; + } + } +} + +/************************************************************************/ +/* WriteVarInt() */ +/************************************************************************/ + +inline void WriteVarInt(std::vector &abyBuffer, int64_t nVal) +{ + uint64_t nUVal; + if (nVal < 0) + { + if (nVal == std::numeric_limits::min()) + nUVal = static_cast(1) << 63; + else + nUVal = -nVal; + if (nUVal >= 0x40) + { + WriteUInt8(abyBuffer, + static_cast(0x80 | 0x40 | (nUVal & 0x3F))); + nUVal >>= 6; + } + else + { + WriteUInt8(abyBuffer, static_cast(0x40 | (nUVal & 0x3F))); + return; + } + } + else + { + nUVal = nVal; + if (nUVal >= 0x40) + { + WriteUInt8(abyBuffer, static_cast(0x80 | (nUVal & 0x3F))); + nUVal >>= 6; + } + else + { + WriteUInt8(abyBuffer, static_cast((nUVal & 0x3F))); + return; + } + } + + WriteVarUInt(abyBuffer, nUVal); +} + +/************************************************************************/ +/* WriteFloat32() */ +/************************************************************************/ + +inline void WriteFloat32(std::vector &abyBuffer, float fVal) +{ + CPL_LSBPTR32(&fVal); + const GByte *pabyInput = reinterpret_cast(&fVal); + abyBuffer.insert(abyBuffer.end(), pabyInput, pabyInput + sizeof(fVal)); +} + +/************************************************************************/ +/* WriteFloat64() */ +/************************************************************************/ + +inline void WriteFloat64(std::vector &abyBuffer, double dfVal) +{ + CPL_LSBPTR64(&dfVal); + const GByte *pabyInput = reinterpret_cast(&dfVal); + abyBuffer.insert(abyBuffer.end(), pabyInput, pabyInput + sizeof(dfVal)); +} + +} // namespace + +/************************************************************************/ +/* OGRFeature::SerializeToBinary() */ +/************************************************************************/ + +/** Serialize the feature to a binary encoding. + * + * This saves the feature ID, attribute fields content and geometry fields + * content. + * + * This method is aimed at being paired with DeserializeFromBinary(). + * + * The format of that encoding may vary across GDAL versions. + * + * Note that abyBuffer is cleared at the beginning of this function. + * + * @since 3.9 + */ +bool OGRFeature::SerializeToBinary(std::vector &abyBuffer) const +{ + const int nFieldCount = poDefn->GetFieldCount(); + const int nGeomFieldCount = poDefn->GetGeomFieldCount(); + try + { + abyBuffer.clear(); + // Set field flags + // For attribute fields, we have 2 bits + // - first one set if the field is unset + // - second one set if the field is null + // For geometry fields, we have one bit set to indicate if the geometry + // is non-null. + const size_t nPresenceFlagsSize = + ((2 * nFieldCount + nGeomFieldCount) + 7) / 8; + abyBuffer.resize(nPresenceFlagsSize); + + WriteVarInt(abyBuffer, GetFID()); + + const auto SetFlagBit = [&abyBuffer](int iBit) + { abyBuffer[iBit / 8] |= (1 << (iBit % 8)); }; + + for (int i = 0; i < nFieldCount; ++i) + { + const OGRField &uField = pauFields[i]; + if (OGR_RawField_IsUnset(&uField)) + { + const int iBit = 2 * i; + SetFlagBit(iBit); + continue; + } + if (OGR_RawField_IsNull(&uField)) + { + const int iBit = 2 * i + 1; + SetFlagBit(iBit); + continue; + } + const auto poFDefn = poDefn->GetFieldDefn(i); + switch (poFDefn->GetType()) + { + case OFTInteger: + { + WriteVarInt(abyBuffer, uField.Integer); + break; + } + case OFTInteger64: + { + WriteVarInt(abyBuffer, uField.Integer64); + break; + } + case OFTReal: + { + WriteFloat64(abyBuffer, uField.Real); + break; + } + case OFTString: + { + const size_t nStrSize = strlen(uField.String); + WriteVarUInt(abyBuffer, nStrSize); + const GByte *pabyStr = + reinterpret_cast(uField.String); + abyBuffer.insert(abyBuffer.end(), pabyStr, + pabyStr + nStrSize); + break; + } + case OFTIntegerList: + { + WriteVarInt(abyBuffer, uField.IntegerList.nCount); + for (int j = 0; j < uField.IntegerList.nCount; ++j) + WriteVarInt(abyBuffer, uField.IntegerList.paList[j]); + break; + } + case OFTInteger64List: + { + WriteVarInt(abyBuffer, uField.Integer64List.nCount); + for (int j = 0; j < uField.Integer64List.nCount; ++j) + WriteVarInt(abyBuffer, uField.Integer64List.paList[j]); + break; + } + case OFTRealList: + { + WriteVarInt(abyBuffer, uField.RealList.nCount); + for (int j = 0; j < uField.RealList.nCount; ++j) + WriteFloat64(abyBuffer, uField.RealList.paList[j]); + break; + } + case OFTStringList: + { + WriteVarInt(abyBuffer, uField.StringList.nCount); + for (int j = 0; j < uField.StringList.nCount; ++j) + { + const char *pszStr = uField.StringList.paList[j]; + const size_t nStrSize = strlen(pszStr); + WriteVarUInt(abyBuffer, nStrSize); + const GByte *pabyStr = + reinterpret_cast(pszStr); + abyBuffer.insert(abyBuffer.end(), pabyStr, + pabyStr + nStrSize); + } + break; + } + case OFTBinary: + { + WriteVarInt(abyBuffer, uField.Binary.nCount); + abyBuffer.insert(abyBuffer.end(), uField.Binary.paData, + uField.Binary.paData + + uField.Binary.nCount); + break; + } + case OFTWideString: + case OFTWideStringList: + break; + case OFTDate: + { + WriteVarInt(abyBuffer, uField.Date.Year); + WriteUInt8(abyBuffer, uField.Date.Month); + WriteUInt8(abyBuffer, uField.Date.Day); + break; + } + case OFTTime: + { + WriteUInt8(abyBuffer, uField.Date.Hour); + WriteUInt8(abyBuffer, uField.Date.Minute); + WriteFloat32(abyBuffer, uField.Date.Second); + WriteUInt8(abyBuffer, uField.Date.TZFlag); + break; + } + case OFTDateTime: + { + WriteVarInt(abyBuffer, uField.Date.Year); + WriteUInt8(abyBuffer, uField.Date.Month); + WriteUInt8(abyBuffer, uField.Date.Day); + WriteUInt8(abyBuffer, uField.Date.Hour); + WriteUInt8(abyBuffer, uField.Date.Minute); + WriteFloat32(abyBuffer, uField.Date.Second); + WriteUInt8(abyBuffer, uField.Date.TZFlag); + break; + } + } + } + for (int i = 0; i < nGeomFieldCount; ++i) + { + if (!papoGeometries[i]) + { + const int iBit = 2 * nFieldCount + i; + SetFlagBit(iBit); + continue; + } + const size_t nSize = papoGeometries[i]->WkbSize(); + WriteVarUInt(abyBuffer, nSize); + const size_t nBufSizeBefore = abyBuffer.size(); + abyBuffer.resize(nBufSizeBefore + nSize); + papoGeometries[i]->exportToWkb( + wkbNDR, abyBuffer.data() + nBufSizeBefore, wkbVariantIso); + } + return true; + } + catch (const std::bad_alloc &) + { + CPLError(CE_Failure, CPLE_OutOfMemory, "Out of memory"); + return false; + } +} + +namespace +{ +// Implementation borrowed to OpenFileGDB + +/************************************************************************/ +/* ReadVarUInt() */ +/************************************************************************/ + +template +static bool ReadVarUInt(const GByte *&pabyIter, const GByte *pabyEnd, + OutType &nOutVal) +{ + if (pabyIter >= pabyEnd) + return false; + OutType b = *pabyIter; + if ((b & 0x80) == 0) + { + pabyIter++; + nOutVal = b; + return true; + } + const GByte *pabyLocalIter = pabyIter + 1; + int nShift = 7; + OutType nVal = (b & 0x7F); + while (true) + { + if (pabyLocalIter >= pabyEnd) + return false; + b = *pabyLocalIter; + pabyLocalIter++; + nVal |= (b & 0x7F) << nShift; + if ((b & 0x80) == 0) + { + pabyIter = pabyLocalIter; + nOutVal = nVal; + return true; + } + nShift += 7; + // To avoid undefined behavior later when doing << nShift + if (nShift >= static_cast(sizeof(OutType)) * 8) + { + return false; + } + } +} + +/************************************************************************/ +/* ReadVarInt() */ +/************************************************************************/ + +template +CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW static bool +ReadVarInt(const GByte *&pabyIter, const GByte *pabyEnd, OutType &nOutVal) +{ + GUInt32 b; + + if (pabyIter >= pabyEnd) + return false; + b = *pabyIter; + GUIntBig nVal = (b & 0x3F); + bool bNegative = (b & 0x40) != 0; + if ((b & 0x80) == 0) + { + pabyIter++; + if (bNegative) + nOutVal = -static_cast(nVal); + else + nOutVal = static_cast(nVal); + return true; + } + + const GByte *pabyLocalIter = pabyIter + 1; + int nShift = 6; + while (true) + { + if (pabyLocalIter >= pabyEnd) + return false; + GUIntBig b64 = *pabyLocalIter; + pabyLocalIter++; + nVal |= (b64 & 0x7F) << nShift; + if ((b64 & 0x80) == 0) + { + pabyIter = pabyLocalIter; + if (bNegative) + nOutVal = -static_cast(nVal); + else + nOutVal = static_cast(nVal); + return true; + } + nShift += 7; + // To avoid undefined behavior later when doing << nShift + if (nShift >= static_cast(sizeof(GIntBig)) * 8) + { + return false; + } + } +} + +/************************************************************************/ +/* ReadUInt8() */ +/************************************************************************/ + +inline bool ReadUInt8(const GByte *&pabyIter, const GByte *pabyEnd, GByte &nVal) +{ + if (pabyIter + sizeof(nVal) > pabyEnd) + return false; + nVal = *pabyIter; + pabyIter += sizeof(nVal); + return true; +} + +/************************************************************************/ +/* ReadFloat32() */ +/************************************************************************/ + +inline bool ReadFloat32(const GByte *&pabyIter, const GByte *pabyEnd, + float &fVal) +{ + if (pabyIter + sizeof(fVal) > pabyEnd) + return false; + memcpy(&fVal, pabyIter, sizeof(fVal)); + CPL_LSBPTR32(&fVal); + pabyIter += sizeof(fVal); + return true; +} + +/************************************************************************/ +/* ReadFloat64() */ +/************************************************************************/ + +inline bool ReadFloat64(const GByte *&pabyIter, const GByte *pabyEnd, + double &dfVal) +{ + if (pabyIter + sizeof(dfVal) > pabyEnd) + return false; + memcpy(&dfVal, pabyIter, sizeof(dfVal)); + CPL_LSBPTR64(&dfVal); + pabyIter += sizeof(dfVal); + return true; +} + +} // namespace + +/************************************************************************/ +/* OGRFeature::DeserializeFromBinary() */ +/************************************************************************/ + +/** Instantiate a feature from a binary encoding produces by SerializeToBinary() + * + * This sets the feature ID, attribute fields content and geometry fields + * content. + * + * DeserializeFromBinary() should be called on a feature whose feature definition + * is exactly the same as the one on which SerializeToBinary() was called. + * (but there is no security issue if not doing that, or if feeding a "random" + * buffer to that method). + * + * The format of that encoding may vary across GDAL versions. + * + * @since 3.9 + */ +bool OGRFeature::DeserializeFromBinary(const GByte *pabyBuffer, size_t nSize) +{ + Reset(); + + const GByte *const pabyFlags = pabyBuffer; + const GByte *const pabyEnd = pabyBuffer + nSize; + const int nFieldCount = poDefn->GetFieldCount(); + const int nGeomFieldCount = poDefn->GetGeomFieldCount(); + const size_t nPresenceFlagsSize = + ((2 * nFieldCount + nGeomFieldCount) + 7) / 8; + if (nSize < nPresenceFlagsSize) + return false; + pabyBuffer += nPresenceFlagsSize; + + if (!ReadVarInt(pabyBuffer, pabyEnd, nFID)) + return false; + + const auto IsFlagBitSet = [pabyFlags](int iBit) -> bool + { return (pabyFlags[iBit / 8] & (1 << (iBit % 8))) != 0; }; + + for (int i = 0; i < nFieldCount; ++i) + { + OGRField &uField = pauFields[i]; + { + const int iBit = 2 * i; + if (IsFlagBitSet(iBit)) + { + // OGR_RawField_SetUnset(&uField); + continue; + } + } + { + const int iBit = 2 * i + 1; + if (IsFlagBitSet(iBit)) + { + OGR_RawField_SetNull(&uField); + continue; + } + } + const auto poFDefn = poDefn->GetFieldDefn(i); + switch (poFDefn->GetType()) + { + case OFTInteger: + { + uField.Set.nMarker2 = 0; + uField.Set.nMarker3 = 0; + if (!ReadVarInt(pabyBuffer, pabyEnd, uField.Integer)) + return false; + break; + } + case OFTInteger64: + { + uField.Set.nMarker2 = 0; + uField.Set.nMarker3 = 0; + if (!ReadVarInt(pabyBuffer, pabyEnd, uField.Integer64)) + return false; + break; + } + case OFTReal: + { + uField.Set.nMarker2 = 0; + uField.Set.nMarker3 = 0; + if (!ReadFloat64(pabyBuffer, pabyEnd, uField.Real)) + return false; + break; + } + case OFTString: + { + size_t nStrSize = 0; + if (!ReadVarUInt(pabyBuffer, pabyEnd, nStrSize) || + nStrSize > std::numeric_limits::max() - 1) + { + return false; + } + if (nStrSize > static_cast(pabyEnd - pabyBuffer)) + return false; + auto ptr = + static_cast(VSI_MALLOC_VERBOSE(nStrSize + 1)); + if (!ptr) + return false; + uField.Set.nMarker2 = 0; + uField.Set.nMarker3 = 0; + uField.String = ptr; + memcpy(uField.String, pabyBuffer, nStrSize); + uField.String[nStrSize] = 0; + pabyBuffer += nStrSize; + break; + } + case OFTIntegerList: + { + int nCount = 0; + if (!ReadVarInt(pabyBuffer, pabyEnd, nCount) || nCount < 0 || + nCount > pabyEnd - pabyBuffer) + { + return false; + } + auto ptr = static_cast( + VSI_MALLOC2_VERBOSE(nCount, sizeof(int))); + if (!ptr) + return false; + uField.Set.nMarker2 = 0; + uField.Set.nMarker3 = 0; + uField.IntegerList.paList = ptr; + uField.IntegerList.nCount = nCount; + for (int j = 0; j < nCount; ++j) + { + if (!ReadVarInt(pabyBuffer, pabyEnd, + uField.IntegerList.paList[j])) + return false; + } + break; + } + case OFTInteger64List: + { + int nCount = 0; + if (!ReadVarInt(pabyBuffer, pabyEnd, nCount) || nCount < 0 || + nCount > pabyEnd - pabyBuffer) + { + return false; + } + auto ptr = static_cast( + VSI_MALLOC2_VERBOSE(nCount, sizeof(GIntBig))); + if (!ptr) + return false; + uField.Set.nMarker2 = 0; + uField.Set.nMarker3 = 0; + uField.Integer64List.paList = ptr; + uField.Integer64List.nCount = nCount; + for (int j = 0; j < nCount; ++j) + { + if (!ReadVarInt(pabyBuffer, pabyEnd, + uField.Integer64List.paList[j])) + return false; + } + break; + } + case OFTRealList: + { + int nCount = 0; + if (!ReadVarInt(pabyBuffer, pabyEnd, nCount) || nCount < 0 || + nCount > pabyEnd - pabyBuffer) + { + return false; + } + auto ptr = static_cast( + VSI_MALLOC2_VERBOSE(nCount, sizeof(double))); + if (!ptr) + return false; + uField.Set.nMarker2 = 0; + uField.Set.nMarker3 = 0; + uField.RealList.paList = ptr; + uField.RealList.nCount = nCount; + for (int j = 0; j < nCount; ++j) + { + if (!ReadFloat64(pabyBuffer, pabyEnd, + uField.RealList.paList[j])) + return false; + } + break; + } + case OFTStringList: + { + int nCount = 0; + if (!ReadVarInt(pabyBuffer, pabyEnd, nCount) || nCount < 0 || + nCount > std::numeric_limits::max() - 1 || + nCount > pabyEnd - pabyBuffer) + { + return false; + } + auto ptr = static_cast( + VSI_CALLOC_VERBOSE(nCount + 1, sizeof(char *))); + if (!ptr) + return false; + uField.Set.nMarker2 = 0; + uField.Set.nMarker3 = 0; + uField.StringList.paList = ptr; + uField.StringList.nCount = nCount; + for (int j = 0; j < nCount; ++j) + { + size_t nStrSize = 0; + if (!ReadVarUInt(pabyBuffer, pabyEnd, nStrSize) || + nStrSize > std::numeric_limits::max() - 1) + { + return false; + } + if (nStrSize > static_cast(pabyEnd - pabyBuffer)) + return false; + uField.StringList.paList[j] = + static_cast(VSI_MALLOC_VERBOSE(nStrSize + 1)); + if (!uField.StringList.paList[j]) + return false; + memcpy(uField.StringList.paList[j], pabyBuffer, nStrSize); + uField.StringList.paList[j][nStrSize] = 0; + pabyBuffer += nStrSize; + } + break; + } + case OFTBinary: + { + int nBinSize = 0; + if (!ReadVarInt(pabyBuffer, pabyEnd, nBinSize) || nBinSize < 0) + { + return false; + } + if (nBinSize > pabyEnd - pabyBuffer) + return false; + auto ptr = static_cast(VSI_MALLOC_VERBOSE(nBinSize)); + if (!ptr) + return false; + uField.Set.nMarker2 = 0; + uField.Set.nMarker3 = 0; + uField.Binary.paData = ptr; + uField.Binary.nCount = nBinSize; + memcpy(uField.Binary.paData, pabyBuffer, nBinSize); + pabyBuffer += nBinSize; + break; + } + case OFTWideString: + case OFTWideStringList: + break; + case OFTDate: + { + memset(&uField, 0, sizeof(uField)); + if (!ReadVarInt(pabyBuffer, pabyEnd, uField.Date.Year) || + !ReadUInt8(pabyBuffer, pabyEnd, uField.Date.Month) || + !ReadUInt8(pabyBuffer, pabyEnd, uField.Date.Day)) + { + return false; + } + break; + } + case OFTTime: + { + memset(&uField, 0, sizeof(uField)); + if (!ReadUInt8(pabyBuffer, pabyEnd, uField.Date.Hour) || + !ReadUInt8(pabyBuffer, pabyEnd, uField.Date.Minute) || + !ReadFloat32(pabyBuffer, pabyEnd, uField.Date.Second) || + !ReadUInt8(pabyBuffer, pabyEnd, uField.Date.TZFlag)) + { + return false; + } + break; + } + case OFTDateTime: + { + memset(&uField, 0, sizeof(uField)); + if (!ReadVarInt(pabyBuffer, pabyEnd, uField.Date.Year) || + !ReadUInt8(pabyBuffer, pabyEnd, uField.Date.Month) || + !ReadUInt8(pabyBuffer, pabyEnd, uField.Date.Day) || + !ReadUInt8(pabyBuffer, pabyEnd, uField.Date.Hour) || + !ReadUInt8(pabyBuffer, pabyEnd, uField.Date.Minute) || + !ReadFloat32(pabyBuffer, pabyEnd, uField.Date.Second) || + !ReadUInt8(pabyBuffer, pabyEnd, uField.Date.TZFlag)) + { + return false; + } + break; + } + } + } + for (int i = 0; i < nGeomFieldCount; ++i) + { + const int iBit = 2 * nFieldCount + i; + if (IsFlagBitSet(iBit)) + { + continue; + } + size_t nWkbSize = 0; + if (!ReadVarUInt(pabyBuffer, pabyEnd, nWkbSize)) + { + return false; + } + if (nWkbSize > static_cast(pabyEnd - pabyBuffer)) + { + return false; + } + OGRGeometry *poGeom = nullptr; + if (OGRGeometryFactory::createFromWkb( + pabyBuffer, poDefn->GetGeomFieldDefn(i)->GetSpatialRef(), + &poGeom, nWkbSize, wkbVariantIso) != OGRERR_NONE || + !poGeom) + { + delete poGeom; + return false; + } + pabyBuffer += nWkbSize; + papoGeometries[i] = poGeom; + } + return true; +} + /************************************************************************/ /* OGRFeature::ConstFieldIterator */ /************************************************************************/ diff --git a/ogr/ogrsf_frmts/arrow/ogr_feather.h b/ogr/ogrsf_frmts/arrow/ogr_feather.h index 59bc6b51df16..d02c73563d97 100644 --- a/ogr/ogrsf_frmts/arrow/ogr_feather.h +++ b/ogr/ogrsf_frmts/arrow/ogr_feather.h @@ -165,7 +165,7 @@ class OGRFeatherWriterLayer final : public OGRArrowWriterLayer return m_poFileWriter != nullptr; } virtual void CreateWriter() override; - virtual void CloseFileWriter() override; + virtual bool CloseFileWriter() override; virtual void CreateSchema() override; virtual void PerformStepsBeforeFinalFlushGroup() override; diff --git a/ogr/ogrsf_frmts/arrow/ogrfeatherwriterlayer.cpp b/ogr/ogrsf_frmts/arrow/ogrfeatherwriterlayer.cpp index 491763e7178f..c9fa9c7f8057 100644 --- a/ogr/ogrsf_frmts/arrow/ogrfeatherwriterlayer.cpp +++ b/ogr/ogrsf_frmts/arrow/ogrfeatherwriterlayer.cpp @@ -201,7 +201,7 @@ bool OGRFeatherWriterLayer::SetOptions(const std::string &osFilename, /* CloseFileWriter() */ /************************************************************************/ -void OGRFeatherWriterLayer::CloseFileWriter() +bool OGRFeatherWriterLayer::CloseFileWriter() { auto status = m_poFileWriter->Close(); if (!status.ok()) @@ -210,6 +210,7 @@ void OGRFeatherWriterLayer::CloseFileWriter() "FileWriter::Close() failed with %s", status.message().c_str()); } + return status.ok(); } /************************************************************************/ @@ -457,7 +458,7 @@ bool OGRFeatherWriterLayer::FlushGroup() } } - m_apoBuilders.clear(); + ClearArrayBuilers(); return ret; } diff --git a/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h b/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h index 9b31c151206c..ea8375b9da71 100644 --- a/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h +++ b/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h @@ -110,19 +110,34 @@ class OGRArrowLayer CPL_NON_FINAL std::vector m_anMapGeomFieldIndexToArrowColumn{}; std::vector m_aeGeomEncoding{}; - // OGR field indexes for bbox.minx/miny/maxx/maxy Real fields - int m_iBBOXMinXField = -1; - int m_iBBOXMinYField = -1; - int m_iBBOXMaxXField = -1; - int m_iBBOXMaxYField = -1; + //! Describe the bbox column of a geometry column + struct GeomColBBOX + { + bool bIsFloat = false; + int iArrowCol = -1; + int iArrayIdx = -1; // only valid when m_bIgnoredFields == true + int iArrowSubfieldXMin = -1; + int iArrowSubfieldYMin = -1; + int iArrowSubfieldXMax = -1; + int iArrowSubfieldYMax = -1; + }; + //! Map from OGR geometry field index to GeomColBBOX + std::map m_oMapGeomFieldIndexToGeomColBBOX{}; const arrow::BinaryArray *m_poArrayWKB = nullptr; const arrow::LargeBinaryArray *m_poArrayWKBLarge = nullptr; const arrow::Array *m_poArrayBBOX = nullptr; - const arrow::DoubleArray *m_poArrayMinX = nullptr; - const arrow::DoubleArray *m_poArrayMinY = nullptr; - const arrow::DoubleArray *m_poArrayMaxX = nullptr; - const arrow::DoubleArray *m_poArrayMaxY = nullptr; + const arrow::DoubleArray *m_poArrayXMinDouble = nullptr; + const arrow::DoubleArray *m_poArrayYMinDouble = nullptr; + const arrow::DoubleArray *m_poArrayXMaxDouble = nullptr; + const arrow::DoubleArray *m_poArrayYMaxDouble = nullptr; + const arrow::FloatArray *m_poArrayXMinFloat = nullptr; + const arrow::FloatArray *m_poArrayYMinFloat = nullptr; + const arrow::FloatArray *m_poArrayXMaxFloat = nullptr; + const arrow::FloatArray *m_poArrayYMaxFloat = nullptr; + + //! References values in range [0, m_poSchema->field_count()-1] + std::set m_oSetBBoxArrowColumns{}; bool m_bIgnoredFields = false; std::vector @@ -319,6 +334,25 @@ class OGRArrowWriterLayer CPL_NON_FINAL : public OGRLayer std::vector m_aeGeomEncoding{}; int m_nWKTCoordinatePrecision = -1; + //! Whether to use a struct field with the values of the bounding box + // of the geometries. Used by Parquet. + bool m_bWriteBBoxStruct = false; + + //! Schema fields for bounding box of geometry columns. + // Constraint: if not empty, m_apoFieldsBBOX.size() == m_poFeatureDefn->GetGeomFieldCount() + std::vector> m_apoFieldsBBOX{}; + + //! Array builers for bounding box of geometry columns. + // m_apoBuildersBBOXStruct is for the top-level field of type struct. + // m_apoBuildersBBOX{XMin|YMin|XMax|YMax} are for the floating-point values + // Constraint: if not empty, m_apoBuildersBBOX{Struct|XMin|YMin|XMax|YMax}.size() == m_poFeatureDefn->GetGeomFieldCount() + std::vector> + m_apoBuildersBBOXStruct{}; + std::vector> m_apoBuildersBBOXXMin{}; + std::vector> m_apoBuildersBBOXYMin{}; + std::vector> m_apoBuildersBBOXXMax{}; + std::vector> m_apoBuildersBBOXYMax{}; + std::string m_osFIDColumn{}; int64_t m_nFeatureCount = 0; @@ -347,7 +381,7 @@ class OGRArrowWriterLayer CPL_NON_FINAL : public OGRLayer virtual bool IsFileWriterCreated() const = 0; virtual void CreateWriter() = 0; - virtual void CloseFileWriter() = 0; + virtual bool CloseFileWriter() = 0; void CreateSchemaCommon(); void FinalizeSchema(); @@ -357,8 +391,12 @@ class OGRArrowWriterLayer CPL_NON_FINAL : public OGRLayer } void CreateArrayBuilders(); + + //! Clear array builders + void ClearArrayBuilers(); + virtual bool FlushGroup() = 0; - void FinalizeWriting(); + bool FinalizeWriting(); bool WriteArrays(std::function &, const std::shared_ptr &)> postProcessArray); @@ -430,6 +468,8 @@ class OGRArrowWriterLayer CPL_NON_FINAL : public OGRLayer protected: OGRErr ICreateFeature(OGRFeature *poFeature) override; + + bool FlushFeatures(); }; #endif // OGR_ARROW_H diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp index e3c211f442dd..b426508606c9 100644 --- a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp @@ -570,17 +570,6 @@ inline bool OGRArrowLayer::MapArrowTypeToOGR( } oField.SetSubType(eSubType); oField.SetNullable(field->nullable()); - if (type->id() == arrow::Type::DOUBLE) - { - if (field->name() == "bbox.minx") - m_iBBOXMinXField = m_poFeatureDefn->GetFieldCount(); - else if (field->name() == "bbox.miny") - m_iBBOXMinYField = m_poFeatureDefn->GetFieldCount(); - else if (field->name() == "bbox.maxx") - m_iBBOXMaxXField = m_poFeatureDefn->GetFieldCount(); - else if (field->name() == "bbox.maxy") - m_iBBOXMaxYField = m_poFeatureDefn->GetFieldCount(); - } m_poFeatureDefn->AddFieldDefn(&oField); m_anMapFieldIndexToArrowColumn.push_back(path); } @@ -3335,10 +3324,14 @@ OGRArrowLayer::SetBatch(const std::shared_ptr &poBatch) m_poArrayWKB = nullptr; m_poArrayWKBLarge = nullptr; m_poArrayBBOX = nullptr; - m_poArrayMinX = nullptr; - m_poArrayMinY = nullptr; - m_poArrayMaxX = nullptr; - m_poArrayMaxY = nullptr; + m_poArrayXMinDouble = nullptr; + m_poArrayYMinDouble = nullptr; + m_poArrayXMaxDouble = nullptr; + m_poArrayYMaxDouble = nullptr; + m_poArrayXMinFloat = nullptr; + m_poArrayYMinFloat = nullptr; + m_poArrayXMaxFloat = nullptr; + m_poArrayYMaxFloat = nullptr; if (m_poBatch) m_poBatchColumns = m_poBatch->columns(); @@ -3367,83 +3360,74 @@ OGRArrowLayer::SetBatch(const std::shared_ptr &poBatch) m_poArrayWKBLarge = static_cast(poArrayWKB); } + } - if (m_iBBOXMinXField >= 0 && m_iBBOXMinYField >= 0 && - m_iBBOXMaxXField >= 0 && m_iBBOXMaxYField >= 0 && - CPLTestBool(CPLGetConfigOption( - ("OGR_" + GetDriverUCName() + "_USE_BBOX").c_str(), "YES"))) + if (iCol >= 0 && + CPLTestBool(CPLGetConfigOption( + ("OGR_" + GetDriverUCName() + "_USE_BBOX").c_str(), "YES"))) + { + const auto oIter = + m_oMapGeomFieldIndexToGeomColBBOX.find(m_iGeomFieldFilter); + if (oIter != m_oMapGeomFieldIndexToGeomColBBOX.end()) { - const auto GetArray = - [this](int idx, const arrow::Array *&poStructArray) - { - if (m_bIgnoredFields) - { - const int arrayIdx = m_anMapFieldIndexToArrayIndex[idx]; - if (arrayIdx < 0) - return static_cast( - nullptr); - auto array = m_poBatchColumns[arrayIdx].get(); - CPLAssert(array->type_id() == arrow::Type::DOUBLE); - return static_cast(array); - } - else - { - auto array = - m_poBatchColumns[m_anMapFieldIndexToArrowColumn[idx] - [0]] - .get(); - ; - int j = 1; - while (array->type_id() == arrow::Type::STRUCT) - { - if (j == 1) - poStructArray = array; - const auto castArray = - static_cast(array); - const auto &subArrays = castArray->fields(); - CPLAssert(j < - static_cast( - m_anMapFieldIndexToArrowColumn[idx] - .size())); - const int iArrowSubcol = - m_anMapFieldIndexToArrowColumn[idx][j]; - j++; - CPLAssert(iArrowSubcol < - static_cast(subArrays.size())); - array = subArrays[iArrowSubcol].get(); - } - CPLAssert(array->type_id() == arrow::Type::DOUBLE); - return static_cast(array); - } - }; - - const arrow::Array *poStructArrayMinX = nullptr; - const arrow::Array *poStructArrayMinY = nullptr; - const arrow::Array *poStructArrayMaxX = nullptr; - const arrow::Array *poStructArrayMaxY = nullptr; - m_poArrayMinX = GetArray(m_iBBOXMinXField, poStructArrayMinX); - m_poArrayMinY = GetArray(m_iBBOXMinYField, poStructArrayMinY); - m_poArrayMaxX = GetArray(m_iBBOXMaxXField, poStructArrayMaxX); - m_poArrayMaxY = GetArray(m_iBBOXMaxYField, poStructArrayMaxY); - - if (poStructArrayMinX != poStructArrayMinY || - poStructArrayMinX != poStructArrayMaxX || - poStructArrayMinX != poStructArrayMaxY) + const int idx = m_bIgnoredFields ? oIter->second.iArrayIdx + : oIter->second.iArrowCol; + CPLAssert(idx >= 0); + CPLAssert(static_cast(idx) < m_poBatchColumns.size()); + m_poArrayBBOX = m_poBatchColumns[idx].get(); + CPLAssert(m_poArrayBBOX->type_id() == arrow::Type::STRUCT); + const auto castArray = + static_cast(m_poArrayBBOX); + const auto &subArrays = castArray->fields(); + CPLAssert( + static_cast(oIter->second.iArrowSubfieldXMin) < + subArrays.size()); + const auto xminArray = + subArrays[oIter->second.iArrowSubfieldXMin].get(); + CPLAssert( + static_cast(oIter->second.iArrowSubfieldYMin) < + subArrays.size()); + const auto yminArray = + subArrays[oIter->second.iArrowSubfieldYMin].get(); + CPLAssert( + static_cast(oIter->second.iArrowSubfieldXMax) < + subArrays.size()); + const auto xmaxArray = + subArrays[oIter->second.iArrowSubfieldXMax].get(); + CPLAssert( + static_cast(oIter->second.iArrowSubfieldYMax) < + subArrays.size()); + const auto ymaxArray = + subArrays[oIter->second.iArrowSubfieldYMax].get(); + if (oIter->second.bIsFloat) { - m_poArrayBBOX = nullptr; + CPLAssert(xminArray->type_id() == arrow::Type::FLOAT); + m_poArrayXMinFloat = + static_cast(xminArray); + CPLAssert(yminArray->type_id() == arrow::Type::FLOAT); + m_poArrayYMinFloat = + static_cast(yminArray); + CPLAssert(xmaxArray->type_id() == arrow::Type::FLOAT); + m_poArrayXMaxFloat = + static_cast(xmaxArray); + CPLAssert(ymaxArray->type_id() == arrow::Type::FLOAT); + m_poArrayYMaxFloat = + static_cast(ymaxArray); } else { - m_poArrayBBOX = poStructArrayMinX; - } - if (!m_poArrayMinX || !m_poArrayMinY || !m_poArrayMaxX || - !m_poArrayMaxY) - { - m_poArrayBBOX = nullptr; - m_poArrayMinX = nullptr; - m_poArrayMinY = nullptr; - m_poArrayMaxX = nullptr; - m_poArrayMaxY = nullptr; + CPLAssert(xminArray->type_id() == arrow::Type::DOUBLE); + m_poArrayXMinDouble = + static_cast(xminArray); + CPLAssert(yminArray->type_id() == arrow::Type::DOUBLE); + m_poArrayYMinDouble = + static_cast(yminArray); + CPLAssert(xmaxArray->type_id() == arrow::Type::DOUBLE); + m_poArrayXMaxDouble = + static_cast(xmaxArray); + CPLAssert(ymaxArray->type_id() == arrow::Type::DOUBLE); + m_poArrayYMaxDouble = + static_cast(ymaxArray); } } } @@ -3479,6 +3463,51 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() { iCol = m_anMapGeomFieldIndexToArrowColumn[m_iGeomFieldFilter]; } + + OGREnvelope sEnvelopeSkipToNextFeatureDueToBBOX; + const auto SkipToNextFeatureDueToBBOX = + [this, &sEnvelopeSkipToNextFeatureDueToBBOX]() + { + if (!m_poArrayBBOX || !m_poArrayBBOX->IsNull(m_nIdxInBatch)) + { + if (m_poArrayXMinFloat && + !m_poArrayXMinFloat->IsNull(m_nIdxInBatch)) + { + sEnvelopeSkipToNextFeatureDueToBBOX.MinX = + m_poArrayXMinFloat->Value(m_nIdxInBatch); + sEnvelopeSkipToNextFeatureDueToBBOX.MinY = + m_poArrayYMinFloat->Value(m_nIdxInBatch); + sEnvelopeSkipToNextFeatureDueToBBOX.MaxX = + m_poArrayXMaxFloat->Value(m_nIdxInBatch); + sEnvelopeSkipToNextFeatureDueToBBOX.MaxY = + m_poArrayYMaxFloat->Value(m_nIdxInBatch); + if (!m_sFilterEnvelope.Intersects( + sEnvelopeSkipToNextFeatureDueToBBOX)) + { + return true; + } + } + else if (m_poArrayXMinDouble && + !m_poArrayXMinDouble->IsNull(m_nIdxInBatch)) + { + sEnvelopeSkipToNextFeatureDueToBBOX.MinX = + m_poArrayXMinDouble->Value(m_nIdxInBatch); + sEnvelopeSkipToNextFeatureDueToBBOX.MinY = + m_poArrayYMinDouble->Value(m_nIdxInBatch); + sEnvelopeSkipToNextFeatureDueToBBOX.MaxX = + m_poArrayXMaxDouble->Value(m_nIdxInBatch); + sEnvelopeSkipToNextFeatureDueToBBOX.MaxY = + m_poArrayYMaxDouble->Value(m_nIdxInBatch); + if (!m_sFilterEnvelope.Intersects( + sEnvelopeSkipToNextFeatureDueToBBOX)) + { + return true; + } + } + } + return false; + }; + if (iCol >= 0 && m_aeGeomEncoding[m_iGeomFieldFilter] == OGRArrowGeomEncoding::WKB) { @@ -3496,19 +3525,9 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() } else { - if (m_poArrayMinX && - (!m_poArrayBBOX || - !m_poArrayBBOX->IsNull(m_nIdxInBatch)) && - !m_poArrayMinX->IsNull(m_nIdxInBatch)) + if (m_poArrayXMinFloat || m_poArrayXMinDouble) { - sEnvelope.MinX = m_poArrayMinX->Value(m_nIdxInBatch); - sEnvelope.MinY = m_poArrayMinY->Value(m_nIdxInBatch); - sEnvelope.MaxX = m_poArrayMaxX->Value(m_nIdxInBatch); - sEnvelope.MaxY = m_poArrayMaxY->Value(m_nIdxInBatch); - if (!m_sFilterEnvelope.Intersects(sEnvelope)) - { - bSkipToNextFeature = true; - } + bSkipToNextFeature = SkipToNextFeatureDueToBBOX(); } else if (m_poArrayWKB) { @@ -3592,7 +3611,12 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() while (true) { - if (!listOfPartsArray->IsNull(m_nIdxInBatch)) + bool bSkipToNextFeature = false; + if (m_poArrayXMinFloat || m_poArrayXMinDouble) + { + bSkipToNextFeature = SkipToNextFeatureDueToBBOX(); + } + else if (!listOfPartsArray->IsNull(m_nIdxInBatch)) { OGREnvelope sEnvelope; const auto nParts = @@ -3626,9 +3650,13 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() if (nParts != 0 && m_sFilterEnvelope.Intersects(sEnvelope)) { - break; + bSkipToNextFeature = true; } } + if (!bSkipToNextFeature) + { + break; + } if (!m_asAttributeFilterConstraints.empty() && !SkipToNextFeatureDueToAttributeFilter()) { @@ -3649,23 +3677,30 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() else if (iCol >= 0) { auto array = m_poBatchColumns[iCol].get(); - OGREnvelope sEnvelope; while (true) { bool bSkipToNextFeature = false; - auto poGeometry = std::unique_ptr( - ReadGeometry(m_iGeomFieldFilter, array, m_nIdxInBatch)); - if (poGeometry == nullptr || poGeometry->IsEmpty()) + if (m_poArrayXMinFloat || m_poArrayXMinDouble) { - bSkipToNextFeature = true; + bSkipToNextFeature = SkipToNextFeatureDueToBBOX(); } else { - poGeometry->getEnvelope(&sEnvelope); - if (!m_sFilterEnvelope.Intersects(sEnvelope)) + auto poGeometry = std::unique_ptr( + ReadGeometry(m_iGeomFieldFilter, array, m_nIdxInBatch)); + if (poGeometry == nullptr || poGeometry->IsEmpty()) { bSkipToNextFeature = true; } + else + { + OGREnvelope sEnvelope; + poGeometry->getEnvelope(&sEnvelope); + if (!m_sFilterEnvelope.Intersects(sEnvelope)) + { + bSkipToNextFeature = true; + } + } } if (!bSkipToNextFeature) { @@ -4316,7 +4351,15 @@ OGRArrowLayer::GetArrowSchemaInternal(struct ArrowSchema *out_schema) const { if (m_iFIDArrowColumn == i) { - j++; + out_schema->children[j] = out_schema->children[i]; + ++j; + } + else if (m_oSetBBoxArrowColumns.find(i) != + m_oSetBBoxArrowColumns.end()) + { + // Remove bounding box columns from exported schema + out_schema->children[i]->release(out_schema->children[i]); + out_schema->children[i] = nullptr; } else { @@ -4455,6 +4498,47 @@ inline int OGRArrowLayer::GetNextArrowArray(struct ArrowArrayStream *stream, return EIO; } + // Remove bounding box columns from exported array + const auto RemoveBBoxColumns = + [out_array, &schema](const std::set &oSetBBoxArrayIndex) + { + int j = 0; + for (int i = 0; i < static_cast(schema.n_children); ++i) + { + if (oSetBBoxArrayIndex.find(i) != oSetBBoxArrayIndex.end()) + { + out_array->children[i]->release(out_array->children[i]); + out_array->children[i] = nullptr; + + schema.children[i]->release(schema.children[i]); + schema.children[i] = nullptr; + } + else + { + out_array->children[j] = out_array->children[i]; + schema.children[j] = schema.children[i]; + ++j; + } + } + out_array->n_children = j; + schema.n_children = j; + }; + + if (m_bIgnoredFields) + { + std::set oSetBBoxArrayIndex; + for (const auto &iter : m_oMapGeomFieldIndexToGeomColBBOX) + { + if (iter.second.iArrayIdx >= 0) + oSetBBoxArrayIndex.insert(iter.second.iArrayIdx); + } + RemoveBBoxColumns(oSetBBoxArrayIndex); + } + else + { + RemoveBBoxColumns(m_oSetBBoxArrowColumns); + } + if (EQUAL(m_aosArrowArrayStreamOptions.FetchNameValueDef( "GEOMETRY_ENCODING", ""), "WKB")) diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowwriterlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowwriterlayer.hpp index 118bd2d6cdb8..3f203b265901 100644 --- a/ogr/ogrsf_frmts/arrow_common/ograrrowwriterlayer.hpp +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowwriterlayer.hpp @@ -37,6 +37,7 @@ #include "ogrlayerarrow.h" #include "ogr_wkb.h" +#include #include #include @@ -93,8 +94,10 @@ inline OGRArrowWriterLayer::~OGRArrowWriterLayer() /* FinalizeWriting() */ /************************************************************************/ -inline void OGRArrowWriterLayer::FinalizeWriting() +inline bool OGRArrowWriterLayer::FinalizeWriting() { + bool ret = true; + if (!IsFileWriterCreated()) { CreateWriter(); @@ -104,10 +107,13 @@ inline void OGRArrowWriterLayer::FinalizeWriting() PerformStepsBeforeFinalFlushGroup(); if (!m_apoBuilders.empty() && m_apoFieldsFromArrowSchema.empty()) - FlushGroup(); + ret = FlushGroup(); - CloseFileWriter(); + if (!CloseFileWriter()) + ret = false; } + + return ret; } /************************************************************************/ @@ -343,6 +349,25 @@ inline void OGRArrowWriterLayer::CreateSchemaCommon() fields.emplace_back(std::move(field)); } + if (m_bWriteBBoxStruct) + { + for (int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); ++i) + { + const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(i); + auto bbox_field_xmin(arrow::field("xmin", arrow::float32(), false)); + auto bbox_field_ymin(arrow::field("ymin", arrow::float32(), false)); + auto bbox_field_xmax(arrow::field("xmax", arrow::float32(), false)); + auto bbox_field_ymax(arrow::field("ymax", arrow::float32(), false)); + auto bbox_field(arrow::field( + std::string(poGeomFieldDefn->GetNameRef()).append("_bbox"), + arrow::struct_({bbox_field_xmin, bbox_field_ymin, + bbox_field_xmax, bbox_field_ymax}), + poGeomFieldDefn->IsNullable())); + fields.emplace_back(bbox_field); + m_apoFieldsBBOX.emplace_back(bbox_field); + } + } + m_aoEnvelopes.resize(m_poFeatureDefn->GetGeomFieldCount()); m_oSetWrittenGeometryTypes.resize(m_poFeatureDefn->GetGeomFieldCount()); @@ -743,6 +768,20 @@ MakeGeoArrowBuilder(arrow::MemoryPool *poMemoryPool, int nDim, int nDepth) poMemoryPool, MakeGeoArrowBuilder(poMemoryPool, nDim, nDepth - 1)); } +/************************************************************************/ +/* ClearArrayBuilers() */ +/************************************************************************/ + +inline void OGRArrowWriterLayer::ClearArrayBuilers() +{ + m_apoBuilders.clear(); + m_apoBuildersBBOXStruct.clear(); + m_apoBuildersBBOXXMin.clear(); + m_apoBuildersBBOXYMin.clear(); + m_apoBuildersBBOXXMax.clear(); + m_apoBuildersBBOXYMax.clear(); +} + /************************************************************************/ /* CreateArrayBuilders() */ /************************************************************************/ @@ -922,7 +961,61 @@ inline void OGRArrowWriterLayer::CreateArrayBuilders() CPLAssert(false); } m_apoBuilders.emplace_back(builder); + + if (m_bWriteBBoxStruct) + { + m_apoBuildersBBOXXMin.emplace_back( + std::make_shared(m_poMemoryPool)); + m_apoBuildersBBOXYMin.emplace_back( + std::make_shared(m_poMemoryPool)); + m_apoBuildersBBOXXMax.emplace_back( + std::make_shared(m_poMemoryPool)); + m_apoBuildersBBOXYMax.emplace_back( + std::make_shared(m_poMemoryPool)); + m_apoBuildersBBOXStruct.emplace_back( + std::make_shared( + m_apoFieldsBBOX[i]->type(), m_poMemoryPool, + std::vector>{ + m_apoBuildersBBOXXMin.back(), + m_apoBuildersBBOXYMin.back(), + m_apoBuildersBBOXXMax.back(), + m_apoBuildersBBOXYMax.back()})); + } + } +} + +/************************************************************************/ +/* castToFloatDown() */ +/************************************************************************/ + +// Cf https://github.com/sqlite/sqlite/blob/90e4a3b7fcdf63035d6f35eb44d11ff58ff4b068/ext/rtree/rtree.c#L2993C1-L2995C3 +/* +** Rounding constants for float->double conversion. +*/ +#define RNDTOWARDS (1.0 - 1.0 / 8388608.0) /* Round towards zero */ +#define RNDAWAY (1.0 + 1.0 / 8388608.0) /* Round away from zero */ + +/* +** Convert an sqlite3_value into an RtreeValue (presumably a float) +** while taking care to round toward negative or positive, respectively. +*/ +static float castToFloatDown(double d) +{ + float f = static_cast(d); + if (f > d) + { + f = static_cast(d * (d < 0 ? RNDAWAY : RNDTOWARDS)); + } + return f; +} +static float castToFloatUp(double d) +{ + float f = static_cast(d); + if (f < d) + { + f = static_cast(d * (d < 0 ? RNDTOWARDS : RNDAWAY)); } + return f; } /************************************************************************/ @@ -937,23 +1030,48 @@ inline OGRErr OGRArrowWriterLayer::BuildGeometry(OGRGeometry *poGeom, const auto eColumnGType = m_poFeatureDefn->GetGeomFieldDefn(iGeomField)->GetType(); const bool bIsEmpty = poGeom != nullptr && poGeom->IsEmpty(); + OGREnvelope3D oEnvelope; if (poGeom != nullptr && !bIsEmpty) { if (poGeom->Is3D()) { - OGREnvelope3D oEnvelope; poGeom->getEnvelope(&oEnvelope); m_aoEnvelopes[iGeomField].Merge(oEnvelope); } else { - OGREnvelope oEnvelope; - poGeom->getEnvelope(&oEnvelope); + poGeom->getEnvelope(static_cast(&oEnvelope)); m_aoEnvelopes[iGeomField].Merge(oEnvelope); } m_oSetWrittenGeometryTypes[iGeomField].insert(eGType); } + if (m_bWriteBBoxStruct) + { + if (poGeom && !bIsEmpty) + { + OGR_ARROW_RETURN_OGRERR_NOT_OK( + m_apoBuildersBBOXXMin[iGeomField]->Append( + castToFloatDown(oEnvelope.MinX))); + OGR_ARROW_RETURN_OGRERR_NOT_OK( + m_apoBuildersBBOXYMin[iGeomField]->Append( + castToFloatDown(oEnvelope.MinY))); + OGR_ARROW_RETURN_OGRERR_NOT_OK( + m_apoBuildersBBOXXMax[iGeomField]->Append( + castToFloatUp(oEnvelope.MaxX))); + OGR_ARROW_RETURN_OGRERR_NOT_OK( + m_apoBuildersBBOXYMax[iGeomField]->Append( + castToFloatUp(oEnvelope.MaxY))); + OGR_ARROW_RETURN_OGRERR_NOT_OK( + m_apoBuildersBBOXStruct[iGeomField]->Append()); + } + else + { + OGR_ARROW_RETURN_OGRERR_NOT_OK( + m_apoBuildersBBOXStruct[iGeomField]->AppendNull()); + } + } + if (poGeom == nullptr) { if (m_aeGeomEncoding[iGeomField] == @@ -1654,20 +1772,32 @@ inline OGRErr OGRArrowWriterLayer::ICreateFeature(OGRFeature *poFeature) // Flush the current row group if reaching the limit of rows per group. if (!m_apoBuilders.empty() && m_apoBuilders[0]->length() == m_nRowGroupSize) { - if (!IsFileWriterCreated()) - { - CreateWriter(); - if (!IsFileWriterCreated()) - return OGRERR_FAILURE; - } - - if (!FlushGroup()) + if (!FlushFeatures()) return OGRERR_FAILURE; } return OGRERR_NONE; } +/************************************************************************/ +/* FlushFeatures() */ +/************************************************************************/ + +inline bool OGRArrowWriterLayer::FlushFeatures() +{ + if (m_apoBuilders.empty() || m_apoBuilders[0]->length() == 0) + return true; + + if (!IsFileWriterCreated()) + { + CreateWriter(); + if (!IsFileWriterCreated()) + return false; + } + + return FlushGroup(); +} + /************************************************************************/ /* GetFeatureCount() */ /************************************************************************/ @@ -1767,6 +1897,30 @@ inline bool OGRArrowWriterLayer::WriteArrays( nArrowIdx++; } + + if (m_bWriteBBoxStruct) + { + const int nGeomFieldCount = m_poFeatureDefn->GetGeomFieldCount(); + for (int i = 0; i < nGeomFieldCount; ++i) + { + const auto &field = m_apoFieldsBBOX[i]; + std::shared_ptr array; + auto status = m_apoBuildersBBOXStruct[i]->Finish(&array); + if (!status.ok()) + { + CPLError(CE_Failure, CPLE_AppDefined, + "builder::Finish() for field %s failed with %s", + field->name().c_str(), status.message().c_str()); + return false; + } + + if (!postProcessArray(field, array)) + { + return false; + } + } + } + return true; } @@ -1805,6 +1959,10 @@ inline bool OGRArrowWriterLayer::WriteArrowBatchInternal( CreateArrayBuilders(); } + const int nGeomFieldCount = m_poFeatureDefn->GetGeomFieldCount(); + const int nGeomFieldCountBBoxFields = + m_bWriteBBoxStruct ? nGeomFieldCount : 0; + const char *pszFIDName = CSLFetchNameValueDef( papszOptions, "FID", OGRLayer::DEFAULT_ARROW_FID_NAME); const char *pszSingleGeomFieldName = @@ -1847,15 +2005,17 @@ inline bool OGRArrowWriterLayer::WriteArrowBatchInternal( struct ArrowSchema fidSchema; memset(&fidArray, 0, sizeof(fidArray)); memset(&fidSchema, 0, sizeof(fidSchema)); - std::vector apBuffers; + std::vector apBuffersFid; std::vector fids; + std::set oSetReferencedFieldsInArraySchema; const auto DummyFreeArray = [](struct ArrowArray *ptrArray) { ptrArray->release = nullptr; }; const auto DummyFreeSchema = [](struct ArrowSchema *ptrSchema) { ptrSchema->release = nullptr; }; bool bRebuildBatch = false; - for (int i = 0; i < m_poSchema->num_fields(); ++i) + for (int i = 0; i < m_poSchema->num_fields() - nGeomFieldCountBBoxFields; + ++i) { auto oIter = oMapSchemaChildrenNameToIdx.find(m_poSchema->field(i)->name()); @@ -1870,9 +2030,9 @@ inline bool OGRArrowWriterLayer::WriteArrowBatchInternal( // the output file requires it, creates a default FID column fidArray.release = DummyFreeArray; fidArray.n_buffers = 2; - apBuffers.resize(2); + apBuffersFid.resize(2); fidArray.buffers = - const_cast(apBuffers.data()); + const_cast(apBuffersFid.data()); fids.reserve(static_cast(array->length)); for (size_t iRow = 0; iRow < static_cast(array->length); ++iRow) @@ -1885,7 +2045,7 @@ inline bool OGRArrowWriterLayer::WriteArrowBatchInternal( continue; } } - else if (m_poFeatureDefn->GetGeomFieldCount() == 1 && + else if (nGeomFieldCount == 1 && m_poFeatureDefn->GetGeomFieldIndex( m_poSchema->field(i)->name().c_str()) == 0) { @@ -1907,23 +2067,173 @@ inline bool OGRArrowWriterLayer::WriteArrowBatchInternal( oSetReferencedFieldsInArraySchema.insert(oIter->second); } - std::vector newSchemaChildren( - m_poSchema->num_fields()); - std::vector newArrayChildren(m_poSchema->num_fields()); - for (int i = 0; i < m_poSchema->num_fields(); ++i) + std::vector newSchemaChildren; + std::vector newArrayChildren; + newSchemaChildren.reserve(m_poSchema->num_fields()); + newArrayChildren.reserve(m_poSchema->num_fields()); + for (int i = 0; i < m_poSchema->num_fields() - nGeomFieldCountBBoxFields; + ++i) { if (anMapLayerSchemaToArraySchema[i] < 0) { CPLAssert(m_poSchema->field(i)->name() == m_osFIDColumn); - newSchemaChildren[i] = &fidSchema; - newArrayChildren[i] = &fidArray; + newSchemaChildren.emplace_back(&fidSchema); + newArrayChildren.emplace_back(&fidArray); } else { - newSchemaChildren[i] = - schema->children[anMapLayerSchemaToArraySchema[i]]; - newArrayChildren[i] = - array->children[anMapLayerSchemaToArraySchema[i]]; + newSchemaChildren.emplace_back( + schema->children[anMapLayerSchemaToArraySchema[i]]); + newArrayChildren.emplace_back( + array->children[anMapLayerSchemaToArraySchema[i]]); + } + } + + // Temporary arrays to hold the geometry bounding boxes. + std::vector bboxStructArray; + std::vector bboxStructSchema; + // Note: we cheat a bit by declaring a single instance of the minx/miny/ + // maxx/maxy sub-field ArrowSchema*, and make all struct ArrowSchema point + // to them. That's OK because we use DummyFreeSchema to release, which does + // nothing. + struct ArrowSchema bboxStructSchemaXMin; + struct ArrowSchema bboxStructSchemaYMin; + struct ArrowSchema bboxStructSchemaXMax; + struct ArrowSchema bboxStructSchemaYMax; + constexpr int BBOX_SUBFIELD_COUNT = 4; + std::array + bboxStructSchemaChildren; + constexpr int BBOX_STRUCT_BUFFER_COUNT = 1; // validity bitmap array + // cppcheck-suppress constStatement + std::vector> + bboxStructBuffersPtr; + std::vector> aabyBboxStructValidity; + std::vector> aadfMinX, aadfMinY, aadfMaxX, aadfMaxY; + // cppcheck-suppress constStatement + std::vector> bboxArrays; + // cppcheck-suppress constStatement + std::vector> + bboxArraysPtr; + constexpr int BBOX_SUBFIELD_BUFFER_COUNT = + 2; // validity bitmap array and float array + std::vector, + BBOX_SUBFIELD_COUNT>> + bboxBuffersPtr; + if (m_bWriteBBoxStruct) + { + memset(&bboxStructSchemaXMin, 0, sizeof(bboxStructSchemaXMin)); + memset(&bboxStructSchemaYMin, 0, sizeof(bboxStructSchemaYMin)); + memset(&bboxStructSchemaXMax, 0, sizeof(bboxStructSchemaXMax)); + memset(&bboxStructSchemaYMax, 0, sizeof(bboxStructSchemaYMax)); + + bboxStructSchemaXMin.release = DummyFreeSchema; + bboxStructSchemaXMin.name = "xmin"; + bboxStructSchemaXMin.format = "f"; // float32 + + bboxStructSchemaYMin.release = DummyFreeSchema; + bboxStructSchemaYMin.name = "ymin"; + bboxStructSchemaYMin.format = "f"; // float32 + + bboxStructSchemaXMax.release = DummyFreeSchema; + bboxStructSchemaXMax.name = "xmax"; + bboxStructSchemaXMax.format = "f"; // float32 + + bboxStructSchemaYMax.release = DummyFreeSchema; + bboxStructSchemaYMax.name = "ymax"; + bboxStructSchemaYMax.format = "f"; // float32 + + try + { + constexpr int XMIN_IDX = 0; + constexpr int YMIN_IDX = 1; + constexpr int XMAX_IDX = 2; + constexpr int YMAX_IDX = 3; + bboxStructSchemaChildren[XMIN_IDX] = &bboxStructSchemaXMin; + // cppcheck-suppress objectIndex + bboxStructSchemaChildren[YMIN_IDX] = &bboxStructSchemaYMin; + // cppcheck-suppress objectIndex + bboxStructSchemaChildren[XMAX_IDX] = &bboxStructSchemaXMax; + // cppcheck-suppress objectIndex + bboxStructSchemaChildren[YMAX_IDX] = &bboxStructSchemaYMax; + + bboxStructArray.resize(nGeomFieldCount); + bboxStructSchema.resize(nGeomFieldCount); + bboxArrays.resize(nGeomFieldCount); + bboxArraysPtr.resize(nGeomFieldCount); + bboxBuffersPtr.resize(nGeomFieldCount); + bboxStructBuffersPtr.resize(nGeomFieldCount); + aabyBboxStructValidity.resize(nGeomFieldCount); + memset(bboxStructArray.data(), 0, + nGeomFieldCount * sizeof(bboxStructArray[0])); + memset(bboxStructSchema.data(), 0, + nGeomFieldCount * sizeof(bboxStructSchema[0])); + memset(bboxArrays.data(), 0, + nGeomFieldCount * sizeof(bboxArrays[0])); + aadfMinX.resize(nGeomFieldCount); + aadfMinY.resize(nGeomFieldCount); + aadfMaxX.resize(nGeomFieldCount); + aadfMaxY.resize(nGeomFieldCount); + for (int i = 0; i < nGeomFieldCount; ++i) + { + const bool bIsNullable = CPL_TO_BOOL( + m_poFeatureDefn->GetGeomFieldDefn(i)->IsNullable()); + aadfMinX[i].reserve(static_cast(array->length)); + aadfMinY[i].reserve(static_cast(array->length)); + aadfMaxX[i].reserve(static_cast(array->length)); + aadfMaxY[i].reserve(static_cast(array->length)); + aabyBboxStructValidity[i].resize( + static_cast(array->length + 7) / 8, 0xFF); + + bboxStructSchema[i].release = DummyFreeSchema; + bboxStructSchema[i].name = m_apoFieldsBBOX[i]->name().c_str(); + bboxStructSchema[i].format = "+s"; // structure + bboxStructSchema[i].flags = + bIsNullable ? ARROW_FLAG_NULLABLE : 0; + bboxStructSchema[i].n_children = BBOX_SUBFIELD_COUNT; + bboxStructSchema[i].children = bboxStructSchemaChildren.data(); + + constexpr int VALIDITY_ARRAY_IDX = 0; + constexpr int BBOX_SUBFIELD_FLOAT_VALUE_IDX = 1; + bboxBuffersPtr[i][XMIN_IDX][BBOX_SUBFIELD_FLOAT_VALUE_IDX] = + aadfMinX[i].data(); + bboxBuffersPtr[i][YMIN_IDX][BBOX_SUBFIELD_FLOAT_VALUE_IDX] = + aadfMinY[i].data(); + bboxBuffersPtr[i][XMAX_IDX][BBOX_SUBFIELD_FLOAT_VALUE_IDX] = + aadfMaxX[i].data(); + bboxBuffersPtr[i][YMAX_IDX][BBOX_SUBFIELD_FLOAT_VALUE_IDX] = + aadfMaxY[i].data(); + + for (int j = 0; j < BBOX_SUBFIELD_COUNT; ++j) + { + bboxBuffersPtr[i][j][VALIDITY_ARRAY_IDX] = nullptr; + + bboxArrays[i][j].release = DummyFreeArray; + bboxArrays[i][j].length = array->length; + bboxArrays[i][j].n_buffers = BBOX_SUBFIELD_BUFFER_COUNT; + bboxArrays[i][j].buffers = bboxBuffersPtr[i][j].data(); + + bboxArraysPtr[i][j] = &bboxArrays[i][j]; + } + + bboxStructArray[i].release = DummyFreeArray; + bboxStructArray[i].n_children = BBOX_SUBFIELD_COUNT; + bboxStructArray[i].children = bboxArraysPtr[i].data(); + bboxStructArray[i].length = array->length; + bboxStructArray[i].n_buffers = BBOX_STRUCT_BUFFER_COUNT; + bboxStructBuffersPtr[i][VALIDITY_ARRAY_IDX] = + bIsNullable ? aabyBboxStructValidity[i].data() : nullptr; + bboxStructArray[i].buffers = bboxStructBuffersPtr[i].data(); + + newSchemaChildren.emplace_back(&bboxStructSchema[i]); + newArrayChildren.emplace_back(&bboxStructArray[i]); + } + } + catch (const std::bad_alloc &) + { + CPLError(CE_Failure, CPLE_OutOfMemory, + "Out of memory in " + "OGRArrowWriterLayer::WriteArrowBatchInternal()"); + return false; } } @@ -2018,15 +2328,14 @@ inline bool OGRArrowWriterLayer::WriteArrowBatchInternal( } std::map> oMapGeomFieldNameToArray; - for (int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); - ++i, ++nBuilderIdx) + for (int i = 0; i < nGeomFieldCount; ++i, ++nBuilderIdx) { const char *pszThisGeomFieldName = m_poFeatureDefn->GetGeomFieldDefn(i)->GetNameRef(); int nIdx = poSchema->GetFieldIndex(pszThisGeomFieldName); if (nIdx < 0) { - if (m_poFeatureDefn->GetGeomFieldCount() == 1) + if (nGeomFieldCount == 1) nIdx = poSchema->GetFieldIndex(pszSingleGeomFieldName); if (nIdx < 0) { @@ -2067,6 +2376,8 @@ inline bool OGRArrowWriterLayer::WriteArrowBatchInternal( for (size_t iRow = 0; iRow < static_cast(psGeomArray->length); ++iRow) { + bool bValidGeom = false; + if (!pabyValidity || TestBit(pabyValidity, iRow + psGeomArray->offset)) { @@ -2091,7 +2402,20 @@ inline bool OGRArrowWriterLayer::WriteArrowBatchInternal( static_cast(nType)); if (OGRWKBGetBoundingBox(pabyWkb, nLen, sEnvelope)) { + bValidGeom = true; m_aoEnvelopes[i].Merge(sEnvelope); + + if (m_bWriteBBoxStruct) + { + aadfMinX[i].push_back( + castToFloatDown(sEnvelope.MinX)); + aadfMinY[i].push_back( + castToFloatDown(sEnvelope.MinY)); + aadfMaxX[i].push_back( + castToFloatUp(sEnvelope.MaxX)); + aadfMaxY[i].push_back( + castToFloatUp(sEnvelope.MaxY)); + } } } } @@ -2107,13 +2431,39 @@ inline bool OGRArrowWriterLayer::WriteArrowBatchInternal( delete poGeometry; return false; } + bValidGeom = true; + if (m_bWriteBBoxStruct) + { + poGeometry->getEnvelope(&sEnvelope); + aadfMinX[i].push_back(castToFloatDown(sEnvelope.MinX)); + aadfMinY[i].push_back(castToFloatDown(sEnvelope.MinY)); + aadfMaxX[i].push_back(castToFloatUp(sEnvelope.MaxX)); + aadfMaxY[i].push_back(castToFloatUp(sEnvelope.MaxY)); + } delete poGeometry; } } - else if (m_aeGeomEncoding[i] != OGRArrowGeomEncoding::WKB) + else { - if (BuildGeometry(nullptr, i, poBuilder) != OGRERR_NONE) - return false; + if (m_aeGeomEncoding[i] != OGRArrowGeomEncoding::WKB) + { + if (BuildGeometry(nullptr, i, poBuilder) != OGRERR_NONE) + return false; + } + } + + if (!bValidGeom && m_bWriteBBoxStruct) + { + if ((bboxStructSchema[i].flags & ARROW_FLAG_NULLABLE)) + { + bboxStructArray[i].null_count++; + aabyBboxStructValidity[i][iRow / 8] &= + ~(1 << static_cast(iRow % 8)); + } + aadfMinX[i].push_back(0.0f); + aadfMinY[i].push_back(0.0f); + aadfMaxX[i].push_back(0.0f); + aadfMaxY[i].push_back(0.0f); } } diff --git a/ogr/ogrsf_frmts/parquet/ogr_parquet.h b/ogr/ogrsf_frmts/parquet/ogr_parquet.h index 6880801ede3e..9c7e6440860d 100644 --- a/ogr/ogrsf_frmts/parquet/ogr_parquet.h +++ b/ogr/ogrsf_frmts/parquet/ogr_parquet.h @@ -94,7 +94,24 @@ class OGRParquetLayer final : public OGRParquetLayerBase #endif CPLStringList m_aosFeatherMetadata{}; + //! Describe the bbox column of a geometry column + struct GeomColBBOXParquet + { + int iParquetXMin = -1; + int iParquetYMin = -1; + int iParquetXMax = -1; + int iParquetYMax = -1; + std::vector anParquetCols{}; + }; + //! Map from OGR geometry field index to GeomColBBOXParquet + std::map + m_oMapGeomFieldIndexToGeomColBBOXParquet{}; + void EstablishFeatureDefn(); + void ProcessGeometryColumnCovering( + const std::shared_ptr &field, + const CPLJSONObject &oJSONGeometryColumn, + const std::map &oMapParquetColumnNameToIdx); bool CreateRecordBatchReader(int iStartingRowGroup); bool CreateRecordBatchReader(const std::vector &anRowGroups); bool ReadNextBatch() override; @@ -162,12 +179,27 @@ class OGRParquetLayer final : public OGRParquetLayerBase } static constexpr int OGR_FID_INDEX = -2; - bool GetMinMaxForField(int iRowGroup, // -1 for all - int iOGRField, // or OGR_FID_INDEX - bool bComputeMin, OGRField &sMin, bool &bFoundMin, - bool bComputeMax, OGRField &sMax, bool &bFoundMax, - OGRFieldType &eType, OGRFieldSubType &eSubType, - std::string &osMinTmp, std::string &osMaxTmp) const; + bool GetMinMaxForOGRField(int iRowGroup, // -1 for all + int iOGRField, // or OGR_FID_INDEX + bool bComputeMin, OGRField &sMin, bool &bFoundMin, + bool bComputeMax, OGRField &sMax, bool &bFoundMax, + OGRFieldType &eType, OGRFieldSubType &eSubType, + std::string &osMinTmp, + std::string &osMaxTmp) const; + + bool GetMinMaxForParquetCol(int iRowGroup, // -1 for all + int iCol, + const std::shared_ptr + &arrowType, // potentially nullptr + bool bComputeMin, OGRField &sMin, + bool &bFoundMin, bool bComputeMax, + OGRField &sMax, bool &bFoundMax, + OGRFieldType &eType, OGRFieldSubType &eSubType, + std::string &osMinTmp, + std::string &osMaxTmp) const; + + bool GeomColsBBOXParquet(int iGeom, int &iParquetXMin, int &iParquetYMin, + int &iParquetXMax, int &iParquetYMax) const; }; /************************************************************************/ @@ -253,12 +285,19 @@ class OGRParquetWriterLayer final : public OGRArrowWriterLayer bool m_bEdgesSpherical = false; parquet::WriterProperties::Builder m_oWriterPropertiesBuilder{}; + //! Temporary GeoPackage dataset. Only used in SORT_BY_BBOX mode + std::unique_ptr m_poTmpGPKG{}; + //! Temporary GeoPackage layer. Only used in SORT_BY_BBOX mode + OGRLayer *m_poTmpGPKGLayer = nullptr; + //! Number of features written by ICreateFeature(). Only used in SORT_BY_BBOX mode + GIntBig m_nTmpFeatureCount = 0; + virtual bool IsFileWriterCreated() const override { return m_poFileWriter != nullptr; } virtual void CreateWriter() override; - virtual void CloseFileWriter() override; + virtual bool CloseFileWriter() override; virtual void CreateSchema() override; virtual void PerformStepsBeforeFinalFlushGroup() override; @@ -283,14 +322,15 @@ class OGRParquetWriterLayer final : public OGRArrowWriterLayer std::string GetGeoMetadata() const; + //! Copy temporary GeoPackage layer to final Parquet file + bool CopyTmpGpkgLayerToFinalFile(); + public: OGRParquetWriterLayer( OGRParquetWriterDataset *poDS, arrow::MemoryPool *poMemoryPool, const std::shared_ptr &poOutputStream, const char *pszLayerName); - ~OGRParquetWriterLayer() override; - CPLErr SetMetadata(char **papszMetadata, const char *pszDomain) override; bool SetOptions(CSLConstList papszOptions, @@ -326,10 +366,19 @@ class OGRParquetWriterLayer final : public OGRArrowWriterLayer bool IsArrowSchemaSupported(const struct ArrowSchema *schema, CSLConstList papszOptions, std::string &osErrorMsg) const override; + bool + CreateFieldFromArrowSchema(const struct ArrowSchema *schema, + CSLConstList papszOptions = nullptr) override; bool WriteArrowBatch(const struct ArrowSchema *schema, struct ArrowArray *array, CSLConstList papszOptions = nullptr) override; #endif + + protected: + OGRErr ICreateFeature(OGRFeature *poFeature) override; + + friend class OGRParquetWriterDataset; + bool Close(); }; /************************************************************************/ @@ -351,6 +400,8 @@ class OGRParquetWriterDataset final : public GDALPamDataset return m_poMemoryPool.get(); } + CPLErr Close() override; + int GetLayerCount() override; OGRLayer *GetLayer(int idx) override; int TestCapability(const char *pszCap) override; diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetdataset.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetdataset.cpp index 812b872753a6..a8d53bc139db 100644 --- a/ogr/ogrsf_frmts/parquet/ogrparquetdataset.cpp +++ b/ogr/ogrsf_frmts/parquet/ogrparquetdataset.cpp @@ -137,19 +137,21 @@ OGRLayer *OGRParquetDataset::ExecuteSQL(const char *pszSQLCommand, if (col_func == SWQCF_MIN) { - CPL_IGNORE_RET_VAL(poLayer->GetMinMaxForField( - /* iRowGroup=*/-1, // -1 for all - iOGRField, true, sField, bFound, false, - sFieldDummy, bFoundDummy, eType, eSubType, - sVal, sValDummy)); + CPL_IGNORE_RET_VAL( + poLayer->GetMinMaxForOGRField( + /* iRowGroup=*/-1, // -1 for all + iOGRField, true, sField, bFound, false, + sFieldDummy, bFoundDummy, eType, + eSubType, sVal, sValDummy)); } else if (col_func == SWQCF_MAX) { - CPL_IGNORE_RET_VAL(poLayer->GetMinMaxForField( - /* iRowGroup=*/-1, // -1 for all - iOGRField, false, sFieldDummy, bFoundDummy, - true, sField, bFound, eType, eSubType, - sValDummy, sVal)); + CPL_IGNORE_RET_VAL( + poLayer->GetMinMaxForOGRField( + /* iRowGroup=*/-1, // -1 for all + iOGRField, false, sFieldDummy, + bFoundDummy, true, sField, bFound, + eType, eSubType, sValDummy, sVal)); } else if (col_func == SWQCF_COUNT) { diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetdriver.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetdriver.cpp index 407cb4ce62a1..746032728958 100644 --- a/ogr/ogrsf_frmts/parquet/ogrparquetdriver.cpp +++ b/ogr/ogrsf_frmts/parquet/ogrparquetdriver.cpp @@ -363,6 +363,124 @@ OpenParquetDatasetWithoutMetadata(const std::string &osBasePathIn, #endif +/************************************************************************/ +/* BuildMemDatasetWithRowGroupExtents() */ +/************************************************************************/ + +/** Builds a Memory dataset that contains, for each row-group of the input file, + * the feature count and spatial extent of the features of this row group, + * using Parquet statistics. This assumes that the Parquet file declares + * a "covering":{"bbox":{ ... }} metadata item. + * + * Only for debug purposes. + */ +static GDALDataset *BuildMemDatasetWithRowGroupExtents(OGRParquetLayer *poLayer) +{ + int iParquetXMin = -1; + int iParquetYMin = -1; + int iParquetXMax = -1; + int iParquetYMax = -1; + if (poLayer->GeomColsBBOXParquet(0, iParquetXMin, iParquetYMin, + iParquetXMax, iParquetYMax)) + { + auto poMemDrv = GetGDALDriverManager()->GetDriverByName("Memory"); + if (!poMemDrv) + return nullptr; + auto poMemDS = std::unique_ptr( + poMemDrv->Create("", 0, 0, 0, GDT_Unknown, nullptr)); + if (!poMemDS) + return nullptr; + OGRSpatialReference *poTmpSRS = nullptr; + const auto poSrcSRS = poLayer->GetSpatialRef(); + if (poSrcSRS) + poTmpSRS = poSrcSRS->Clone(); + auto poMemLayer = + poMemDS->CreateLayer("footprint", poTmpSRS, wkbPolygon, nullptr); + if (poTmpSRS) + poTmpSRS->Release(); + if (!poMemLayer) + return nullptr; + poMemLayer->CreateField( + std::make_unique("feature_count", OFTInteger64) + .get()); + + const auto metadata = + poLayer->GetReader()->parquet_reader()->metadata(); + const int numRowGroups = metadata->num_row_groups(); + for (int iRowGroup = 0; iRowGroup < numRowGroups; ++iRowGroup) + { + std::string osMinTmp, osMaxTmp; + OGRField unusedF; + bool unusedB; + OGRFieldSubType unusedSubType; + + OGRField sXMin; + OGR_RawField_SetNull(&sXMin); + bool bFoundXMin = false; + OGRFieldType eXMinType = OFTMaxType; + + OGRField sYMin; + OGR_RawField_SetNull(&sYMin); + bool bFoundYMin = false; + OGRFieldType eYMinType = OFTMaxType; + + OGRField sXMax; + OGR_RawField_SetNull(&sXMax); + bool bFoundXMax = false; + OGRFieldType eXMaxType = OFTMaxType; + + OGRField sYMax; + OGR_RawField_SetNull(&sYMax); + bool bFoundYMax = false; + OGRFieldType eYMaxType = OFTMaxType; + + if (poLayer->GetMinMaxForParquetCol( + iRowGroup, iParquetXMin, nullptr, + /* bComputeMin = */ true, sXMin, bFoundXMin, + /* bComputeMax = */ false, unusedF, unusedB, eXMinType, + unusedSubType, osMinTmp, osMaxTmp) && + bFoundXMin && eXMinType == OFTReal && + poLayer->GetMinMaxForParquetCol( + iRowGroup, iParquetYMin, nullptr, + /* bComputeMin = */ true, sYMin, bFoundYMin, + /* bComputeMax = */ false, unusedF, unusedB, eYMinType, + unusedSubType, osMinTmp, osMaxTmp) && + bFoundYMin && eYMinType == OFTReal && + poLayer->GetMinMaxForParquetCol( + iRowGroup, iParquetXMax, nullptr, + /* bComputeMin = */ false, unusedF, unusedB, + /* bComputeMax = */ true, sXMax, bFoundXMax, eXMaxType, + unusedSubType, osMaxTmp, osMaxTmp) && + bFoundXMax && eXMaxType == OFTReal && + poLayer->GetMinMaxForParquetCol( + iRowGroup, iParquetYMax, nullptr, + /* bComputeMin = */ false, unusedF, unusedB, + /* bComputeMax = */ true, sYMax, bFoundYMax, eYMaxType, + unusedSubType, osMaxTmp, osMaxTmp) && + bFoundYMax && eYMaxType == OFTReal) + { + OGRFeature oFeat(poMemLayer->GetLayerDefn()); + oFeat.SetField(0, + static_cast( + metadata->RowGroup(iRowGroup)->num_rows())); + auto poPoly = std::make_unique(); + auto poLR = std::make_unique(); + poLR->addPoint(sXMin.Real, sYMin.Real); + poLR->addPoint(sXMin.Real, sYMax.Real); + poLR->addPoint(sXMax.Real, sYMax.Real); + poLR->addPoint(sXMax.Real, sYMin.Real); + poLR->addPoint(sXMin.Real, sYMin.Real); + poPoly->addRingDirectly(poLR.release()); + oFeat.SetGeometryDirectly(poPoly.release()); + CPL_IGNORE_RET_VAL(poMemLayer->CreateFeature(&oFeat)); + } + } + + return poMemDS.release(); + } + return nullptr; +} + /************************************************************************/ /* Open() */ /************************************************************************/ @@ -533,6 +651,14 @@ static GDALDataset *OGRParquetDriverOpen(GDALOpenInfo *poOpenInfo) auto poLayer = std::make_unique( poDS.get(), CPLGetBasename(osFilename.c_str()), std::move(arrow_reader), poOpenInfo->papszOpenOptions); + + // For debug purposes: return a layer with the extent of each row group + if (CPLTestBool( + CPLGetConfigOption("OGR_PARQUET_SHOW_ROW_GROUP_EXTENT", "NO"))) + { + return BuildMemDatasetWithRowGroupExtents(poLayer.get()); + } + poDS->SetLayer(std::move(poLayer)); return poDS.release(); } @@ -738,6 +864,27 @@ void OGRParquetDriver::InitMetadata() "Name of creating application"); } + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "WRITE_COVERING_BBOX"); + CPLAddXMLAttributeAndValue(psOption, "type", "boolean"); + CPLAddXMLAttributeAndValue(psOption, "default", "YES"); + CPLAddXMLAttributeAndValue(psOption, "description", + "Whether to write xmin/ymin/xmax/ymax " + "columns with the bounding box of " + "geometries"); + } + + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "SORT_BY_BBOX"); + CPLAddXMLAttributeAndValue(psOption, "type", "boolean"); + CPLAddXMLAttributeAndValue(psOption, "default", "NO"); + CPLAddXMLAttributeAndValue(psOption, "description", + "Whether features should be sorted based on " + "the bounding box of their geometries"); + } + char *pszXML = CPLSerializeXMLTree(oTree.get()); GDALDriver::SetMetadataItem(GDAL_DS_LAYER_CREATIONOPTIONLIST, pszXML); CPLFree(pszXML); diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp index 4ce27f5a4876..d8c092bb1ffe 100644 --- a/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp +++ b/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp @@ -117,6 +117,67 @@ void OGRParquetLayerBase::LoadGeoMetadata( } } +/************************************************************************/ +/* ParseGeometryColumnCovering() */ +/************************************************************************/ + +//! Parse bounding box column definition +static bool ParseGeometryColumnCovering(const CPLJSONObject &oJSONDef, + std::string &osBBOXColumn, + std::string &osXMin, + std::string &osYMin, + std::string &osXMax, + std::string &osYMax) +{ + const auto oCovering = oJSONDef["covering"]; + if (oCovering.IsValid() && + oCovering.GetType() == CPLJSONObject::Type::Object) + { + const auto oBBOX = oCovering["bbox"]; + if (oBBOX.IsValid() && oBBOX.GetType() == CPLJSONObject::Type::Object) + { + const auto oXMin = oBBOX["xmin"]; + const auto oYMin = oBBOX["ymin"]; + const auto oXMax = oBBOX["xmax"]; + const auto oYMax = oBBOX["ymax"]; + if (oXMin.IsValid() && oYMin.IsValid() && oXMax.IsValid() && + oYMax.IsValid() && + oXMin.GetType() == CPLJSONObject::Type::Array && + oYMin.GetType() == CPLJSONObject::Type::Array && + oXMax.GetType() == CPLJSONObject::Type::Array && + oYMax.GetType() == CPLJSONObject::Type::Array) + { + const auto osXMinArray = oXMin.ToArray(); + const auto osYMinArray = oYMin.ToArray(); + const auto osXMaxArray = oXMax.ToArray(); + const auto osYMaxArray = oYMax.ToArray(); + if (osXMinArray.Size() == 2 && osYMinArray.Size() == 2 && + osXMaxArray.Size() == 2 && osYMaxArray.Size() == 2 && + osXMinArray[0].GetType() == CPLJSONObject::Type::String && + osXMinArray[1].GetType() == CPLJSONObject::Type::String && + osYMinArray[0].GetType() == CPLJSONObject::Type::String && + osYMinArray[1].GetType() == CPLJSONObject::Type::String && + osXMaxArray[0].GetType() == CPLJSONObject::Type::String && + osXMaxArray[1].GetType() == CPLJSONObject::Type::String && + osYMaxArray[0].GetType() == CPLJSONObject::Type::String && + osYMaxArray[1].GetType() == CPLJSONObject::Type::String && + osXMinArray[0].ToString() == osYMinArray[0].ToString() && + osXMinArray[0].ToString() == osXMaxArray[0].ToString() && + osXMinArray[0].ToString() == osYMaxArray[0].ToString()) + { + osBBOXColumn = osXMinArray[0].ToString(); + osXMin = osXMinArray[1].ToString(); + osYMin = osYMinArray[1].ToString(); + osXMax = osXMaxArray[1].ToString(); + osYMax = osYMaxArray[1].ToString(); + return true; + } + } + } + } + return false; +} + /************************************************************************/ /* DealWithGeometryColumn() */ /************************************************************************/ @@ -457,8 +518,87 @@ void OGRParquetLayer::EstablishFeatureDefn() return; } + const bool bUseBBOX = CPLTestBool(CPLGetConfigOption( + ("OGR_" + GetDriverUCName() + "_USE_BBOX").c_str(), "YES")); + + // Keep track of declared bounding box columns in GeoParquet JSON metadata, + // in order not to expose them as regular fields. + std::set oSetBBOXColumns; + if (bUseBBOX) + { + for (const auto &iter : m_oMapGeometryColumns) + { + std::string osBBOXColumn; + std::string osXMin, osYMin, osXMax, osYMax; + if (ParseGeometryColumnCovering(iter.second, osBBOXColumn, osXMin, + osYMin, osXMax, osYMax)) + { + oSetBBOXColumns.insert(osBBOXColumn); + } + } + } + const auto &fields = m_poSchema->fields(); const auto poParquetSchema = metadata->schema(); + + // Map from Parquet column name (with dot separator) to Parquet index + std::map oMapParquetColumnNameToIdx; + const int nParquetColumns = poParquetSchema->num_columns(); + for (int iParquetCol = 0; iParquetCol < nParquetColumns; ++iParquetCol) + { + const auto parquetColumn = poParquetSchema->Column(iParquetCol); + const auto parquetColumnName = parquetColumn->path()->ToDotString(); + oMapParquetColumnNameToIdx[parquetColumnName] = iParquetCol; + } + + // Synthetize a GeoParquet bounding box column definition when detecting + // a Overture Map dataset + if (m_oMapGeometryColumns.empty() && bUseBBOX && + oMapParquetColumnNameToIdx.find("geometry") != + oMapParquetColumnNameToIdx.end() && + oMapParquetColumnNameToIdx.find("bbox.minx") != + oMapParquetColumnNameToIdx.end() && + oMapParquetColumnNameToIdx.find("bbox.miny") != + oMapParquetColumnNameToIdx.end() && + oMapParquetColumnNameToIdx.find("bbox.maxx") != + oMapParquetColumnNameToIdx.end() && + oMapParquetColumnNameToIdx.find("bbox.maxy") != + oMapParquetColumnNameToIdx.end()) + { + CPLJSONObject oDef; + CPLJSONObject oCovering; + oDef.Add("covering", oCovering); + CPLJSONObject oBBOX; + oCovering.Add("bbox", oBBOX); + { + CPLJSONArray oArray; + oArray.Add("bbox"); + oArray.Add("minx"); + oBBOX.Add("xmin", oArray); + } + { + CPLJSONArray oArray; + oArray.Add("bbox"); + oArray.Add("miny"); + oBBOX.Add("ymin", oArray); + } + { + CPLJSONArray oArray; + oArray.Add("bbox"); + oArray.Add("maxx"); + oBBOX.Add("xmax", oArray); + } + { + CPLJSONArray oArray; + oArray.Add("bbox"); + oArray.Add("maxy"); + oBBOX.Add("ymax", oArray); + } + oSetBBOXColumns.insert("bbox"); + oDef.Add("encoding", "WKB"); + m_oMapGeometryColumns["geometry"] = oDef; + } + int iParquetCol = 0; for (int i = 0; i < m_poSchema->num_fields(); ++i) { @@ -483,6 +623,14 @@ void OGRParquetLayer::EstablishFeatureDefn() continue; } + if (oSetBBOXColumns.find(field->name()) != oSetBBOXColumns.end()) + { + m_oSetBBoxArrowColumns.insert(i); + if (bParquetColValid) + iParquetCol++; + continue; + } + const auto ComputeGeometryColumnTypeLambda = [this, bParquetColValid, iParquetCol, &poParquetSchema]() { @@ -501,6 +649,13 @@ void OGRParquetLayer::EstablishFeatureDefn() DealWithGeometryColumn(i, field, ComputeGeometryColumnTypeLambda); if (bGeometryField) { + const auto oIter = m_oMapGeometryColumns.find(field->name()); + if (bUseBBOX && oIter != m_oMapGeometryColumns.end()) + { + ProcessGeometryColumnCovering(field, oIter->second, + oMapParquetColumnNameToIdx); + } + m_anMapGeomFieldIndexToParquetColumn.push_back( bParquetColValid ? iParquetCol : -1); if (bParquetColValid) @@ -542,6 +697,100 @@ void OGRParquetLayer::EstablishFeatureDefn() } } +/************************************************************************/ +/* ProcessGeometryColumnCovering() */ +/************************************************************************/ + +/** Process GeoParquet JSON geometry field object to extract information about + * its bounding box column, and appropriately fill m_oMapGeomFieldIndexToGeomColBBOX + * and m_oMapGeomFieldIndexToGeomColBBOXParquet members with information on that + * bounding box column. + */ +void OGRParquetLayer::ProcessGeometryColumnCovering( + const std::shared_ptr &field, + const CPLJSONObject &oJSONGeometryColumn, + const std::map &oMapParquetColumnNameToIdx) +{ + std::string osBBOXColumn; + std::string osXMin, osYMin, osXMax, osYMax; + if (ParseGeometryColumnCovering(oJSONGeometryColumn, osBBOXColumn, osXMin, + osYMin, osXMax, osYMax)) + { + OGRArrowLayer::GeomColBBOX sDesc; + sDesc.iArrowCol = m_poSchema->GetFieldIndex(osBBOXColumn); + const auto fieldBBOX = m_poSchema->GetFieldByName(osBBOXColumn); + if (sDesc.iArrowCol >= 0 && fieldBBOX && + fieldBBOX->type()->id() == arrow::Type::STRUCT) + { + const auto fieldBBOXStruct = + std::static_pointer_cast(fieldBBOX->type()); + const auto fieldXMin = fieldBBOXStruct->GetFieldByName(osXMin); + const auto fieldYMin = fieldBBOXStruct->GetFieldByName(osYMin); + const auto fieldXMax = fieldBBOXStruct->GetFieldByName(osXMax); + const auto fieldYMax = fieldBBOXStruct->GetFieldByName(osYMax); + const int nXMinIdx = fieldBBOXStruct->GetFieldIndex(osXMin); + const int nYMinIdx = fieldBBOXStruct->GetFieldIndex(osYMin); + const int nXMaxIdx = fieldBBOXStruct->GetFieldIndex(osXMax); + const int nYMaxIdx = fieldBBOXStruct->GetFieldIndex(osYMax); + const auto oIterParquetIdxXMin = oMapParquetColumnNameToIdx.find( + std::string(osBBOXColumn).append(".").append(osXMin)); + const auto oIterParquetIdxYMin = oMapParquetColumnNameToIdx.find( + std::string(osBBOXColumn).append(".").append(osYMin)); + const auto oIterParquetIdxXMax = oMapParquetColumnNameToIdx.find( + std::string(osBBOXColumn).append(".").append(osXMax)); + const auto oIterParquetIdxYMax = oMapParquetColumnNameToIdx.find( + std::string(osBBOXColumn).append(".").append(osYMax)); + if (nXMinIdx >= 0 && nYMinIdx >= 0 && nXMaxIdx >= 0 && + nYMaxIdx >= 0 && fieldXMin && fieldYMin && fieldXMax && + fieldYMax && + oIterParquetIdxXMin != oMapParquetColumnNameToIdx.end() && + oIterParquetIdxYMin != oMapParquetColumnNameToIdx.end() && + oIterParquetIdxXMax != oMapParquetColumnNameToIdx.end() && + oIterParquetIdxYMax != oMapParquetColumnNameToIdx.end() && + (fieldXMin->type()->id() == arrow::Type::FLOAT || + fieldXMin->type()->id() == arrow::Type::DOUBLE) && + fieldXMin->type()->id() == fieldYMin->type()->id() && + fieldXMin->type()->id() == fieldXMax->type()->id() && + fieldXMin->type()->id() == fieldYMax->type()->id()) + { + CPLDebug("PARQUET", + "Bounding box column '%s' detected for " + "geometry column '%s'", + osBBOXColumn.c_str(), field->name().c_str()); + sDesc.iArrowSubfieldXMin = nXMinIdx; + sDesc.iArrowSubfieldYMin = nYMinIdx; + sDesc.iArrowSubfieldXMax = nXMaxIdx; + sDesc.iArrowSubfieldYMax = nYMaxIdx; + sDesc.bIsFloat = + (fieldXMin->type()->id() == arrow::Type::FLOAT); + + m_oMapGeomFieldIndexToGeomColBBOX + [m_poFeatureDefn->GetGeomFieldCount() - 1] = + std::move(sDesc); + + GeomColBBOXParquet sDescParquet; + sDescParquet.iParquetXMin = oIterParquetIdxXMin->second; + sDescParquet.iParquetYMin = oIterParquetIdxYMin->second; + sDescParquet.iParquetXMax = oIterParquetIdxXMax->second; + sDescParquet.iParquetYMax = oIterParquetIdxYMax->second; + for (const auto &iterParquetCols : oMapParquetColumnNameToIdx) + { + if (STARTS_WITH( + iterParquetCols.first.c_str(), + std::string(osBBOXColumn).append(".").c_str())) + { + sDescParquet.anParquetCols.push_back( + iterParquetCols.second); + } + } + m_oMapGeomFieldIndexToGeomColBBOXParquet + [m_poFeatureDefn->GetGeomFieldCount() - 1] = + std::move(sDescParquet); + } + } + } +} + /************************************************************************/ /* CheckMatchArrowParquetColumnNames() */ /************************************************************************/ @@ -1054,9 +1303,12 @@ bool OGRParquetLayer::ReadNextBatch() bool bIterateEverything = false; std::vector anSelectedGroups; + const auto oIterToGeomColBBOX = + m_oMapGeomFieldIndexToGeomColBBOXParquet.find(m_iGeomFieldFilter); const bool bUSEBBOXFields = - (m_poFilterGeom && m_iBBOXMinXField >= 0 && m_iBBOXMinYField >= 0 && - m_iBBOXMaxXField >= 0 && m_iBBOXMaxYField >= 0 && + (m_poFilterGeom && + oIterToGeomColBBOX != + m_oMapGeomFieldIndexToGeomColBBOXParquet.end() && CPLTestBool(CPLGetConfigOption( ("OGR_" + GetDriverUCName() + "_USE_BBOX").c_str(), "YES"))); @@ -1087,10 +1339,10 @@ bool OGRParquetLayer::ReadNextBatch() if (bUSEBBOXFields) { - if (GetMinMaxForField(iRowGroup, m_iBBOXMinXField, true, - sMin, bFoundMin, false, sMax, - bFoundMax, eType, eSubType, osMinTmp, - osMaxTmp) && + if (GetMinMaxForParquetCol( + iRowGroup, oIterToGeomColBBOX->second.iParquetXMin, + nullptr, true, sMin, bFoundMin, false, sMax, + bFoundMax, eType, eSubType, osMinTmp, osMaxTmp) && bFoundMin && eType == OFTReal) { const double dfGroupMinX = sMin.Real; @@ -1098,10 +1350,12 @@ bool OGRParquetLayer::ReadNextBatch() { bSelectGroup = false; } - else if (GetMinMaxForField( - iRowGroup, m_iBBOXMinYField, true, sMin, - bFoundMin, false, sMax, bFoundMax, eType, - eSubType, osMinTmp, osMaxTmp) && + else if (GetMinMaxForParquetCol( + iRowGroup, + oIterToGeomColBBOX->second.iParquetYMin, + nullptr, true, sMin, bFoundMin, false, + sMax, bFoundMax, eType, eSubType, osMinTmp, + osMaxTmp) && bFoundMin && eType == OFTReal) { const double dfGroupMinY = sMin.Real; @@ -1109,10 +1363,13 @@ bool OGRParquetLayer::ReadNextBatch() { bSelectGroup = false; } - else if (GetMinMaxForField( - iRowGroup, m_iBBOXMaxXField, false, - sMin, bFoundMin, true, sMax, bFoundMax, - eType, eSubType, osMinTmp, osMaxTmp) && + else if (GetMinMaxForParquetCol( + iRowGroup, + oIterToGeomColBBOX->second + .iParquetXMax, + nullptr, false, sMin, bFoundMin, true, + sMax, bFoundMax, eType, eSubType, + osMinTmp, osMaxTmp) && bFoundMax && eType == OFTReal) { const double dfGroupMaxX = sMax.Real; @@ -1120,11 +1377,13 @@ bool OGRParquetLayer::ReadNextBatch() { bSelectGroup = false; } - else if (GetMinMaxForField( - iRowGroup, m_iBBOXMaxYField, false, - sMin, bFoundMin, true, sMax, - bFoundMax, eType, eSubType, - osMinTmp, osMaxTmp) && + else if (GetMinMaxForParquetCol( + iRowGroup, + oIterToGeomColBBOX->second + .iParquetYMax, + nullptr, false, sMin, bFoundMin, + true, sMax, bFoundMax, eType, + eSubType, osMinTmp, osMaxTmp) && bFoundMax && eType == OFTReal) { const double dfGroupMaxY = sMax.Real; @@ -1160,7 +1419,7 @@ bool OGRParquetLayer::ReadNextBatch() poRowGroup->metadata()->num_rows() - 1; eType = OFTInteger64; } - else if (!GetMinMaxForField( + else if (!GetMinMaxForOGRField( iRowGroup, iOGRField, true, sMin, bFoundMin, true, sMax, bFoundMax, eType, eSubType, osMinTmp, osMaxTmp) || @@ -1355,7 +1614,8 @@ bool OGRParquetLayer::ReadNextBatch() } if (poNextBatch == nullptr) { - if (m_iRecordBatch == 1) + if (m_iRecordBatch == 1 && m_poBatch && m_poAttrQuery == nullptr && + m_poFilterGeom == nullptr) { m_iRecordBatch = 0; m_bSingleBatch = true; @@ -1560,6 +1820,20 @@ OGRErr OGRParquetLayer::SetIgnoredFields(const char **papszFields) m_anMapGeomFieldIndexToArrayIndex.push_back(nBatchColumns); nBatchColumns++; m_anRequestedParquetColumns.push_back(iParquetCol); + + auto oIter = m_oMapGeomFieldIndexToGeomColBBOX.find(i); + const auto oIterParquet = + m_oMapGeomFieldIndexToGeomColBBOXParquet.find(i); + if (oIter != m_oMapGeomFieldIndexToGeomColBBOX.end() && + oIterParquet != + m_oMapGeomFieldIndexToGeomColBBOXParquet.end()) + { + oIter->second.iArrayIdx = nBatchColumns++; + m_anRequestedParquetColumns.insert( + m_anRequestedParquetColumns.end(), + oIterParquet->second.anParquetCols.begin(), + oIterParquet->second.anParquetCols.end()); + } } else { @@ -1608,61 +1882,57 @@ bool OGRParquetLayer::FastGetExtent(int iGeomField, OGREnvelope *psExtent) const if (OGRParquetLayerBase::FastGetExtent(iGeomField, psExtent)) return true; - if (iGeomField == 0 && m_poFeatureDefn->GetGeomFieldCount() == 1) + const auto oIterToGeomColBBOX = + m_oMapGeomFieldIndexToGeomColBBOXParquet.find(iGeomField); + if (oIterToGeomColBBOX != m_oMapGeomFieldIndexToGeomColBBOXParquet.end() && + CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_BBOX", "YES"))) { - // OuvertureMaps dataset have double bbox.minx, bbox.miny, bbox.maxx, - // bboxy.maxy fields with statistics. Use that to quickly compute - // extent. - if (m_iBBOXMinXField >= 0 && m_iBBOXMinYField >= 0 && - m_iBBOXMaxXField >= 0 && m_iBBOXMaxYField >= 0 && - CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_BBOX", "YES"))) - { - OGREnvelope sExtent; - OGRField sMin, sMax; - OGR_RawField_SetNull(&sMin); - OGR_RawField_SetNull(&sMax); - bool bFoundMin, bFoundMax; - OGRFieldType eType = OFTMaxType; - OGRFieldSubType eSubType = OFSTNone; - std::string osMinTmp, osMaxTmp; - if (GetMinMaxForField(-1, m_iBBOXMinXField, true, sMin, bFoundMin, - false, sMax, bFoundMax, eType, eSubType, - osMinTmp, osMaxTmp) && + OGREnvelope sExtent; + OGRField sMin, sMax; + OGR_RawField_SetNull(&sMin); + OGR_RawField_SetNull(&sMax); + bool bFoundMin, bFoundMax; + OGRFieldType eType = OFTMaxType; + OGRFieldSubType eSubType = OFSTNone; + std::string osMinTmp, osMaxTmp; + if (GetMinMaxForParquetCol(-1, oIterToGeomColBBOX->second.iParquetXMin, + nullptr, true, sMin, bFoundMin, false, sMax, + bFoundMax, eType, eSubType, osMinTmp, + osMaxTmp) && + eType == OFTReal) + { + sExtent.MinX = sMin.Real; + + if (GetMinMaxForParquetCol( + -1, oIterToGeomColBBOX->second.iParquetYMin, nullptr, true, + sMin, bFoundMin, false, sMax, bFoundMax, eType, eSubType, + osMinTmp, osMaxTmp) && eType == OFTReal) { - sExtent.MinX = sMin.Real; + sExtent.MinY = sMin.Real; - if (GetMinMaxForField(-1, m_iBBOXMinYField, true, sMin, - bFoundMin, false, sMax, bFoundMax, eType, - eSubType, osMinTmp, osMaxTmp) && + if (GetMinMaxForParquetCol( + -1, oIterToGeomColBBOX->second.iParquetXMax, nullptr, + false, sMin, bFoundMin, true, sMax, bFoundMax, eType, + eSubType, osMinTmp, osMaxTmp) && eType == OFTReal) { - sExtent.MinY = sMin.Real; + sExtent.MaxX = sMax.Real; - if (GetMinMaxForField(-1, m_iBBOXMaxXField, false, sMin, - bFoundMin, true, sMax, bFoundMax, - eType, eSubType, osMinTmp, - osMaxTmp) && + if (GetMinMaxForParquetCol( + -1, oIterToGeomColBBOX->second.iParquetYMax, + nullptr, false, sMin, bFoundMin, true, sMax, + bFoundMax, eType, eSubType, osMinTmp, osMaxTmp) && eType == OFTReal) { - sExtent.MaxX = sMax.Real; - - if (GetMinMaxForField(-1, m_iBBOXMaxYField, false, sMin, - bFoundMin, true, sMax, bFoundMax, - eType, eSubType, osMinTmp, - osMaxTmp) && - eType == OFTReal) - { - sExtent.MaxY = sMax.Real; - - CPLDebug( - "PARQUET", - "Using statistics of bbox.minx, bbox.miny, " - "bbox.maxx, bbox.maxy columns to get extent"); - m_oMapExtents[iGeomField] = sExtent; - *psExtent = sExtent; - return true; - } + sExtent.MaxY = sMax.Real; + + CPLDebug("PARQUET", + "Using statistics of bbox.minx, bbox.miny, " + "bbox.maxx, bbox.maxy columns to get extent"); + m_oMapExtents[iGeomField] = sExtent; + *psExtent = sExtent; + return true; } } } @@ -1918,6 +2188,11 @@ template struct GetStats v = rowGroupVal; } } + else if (columnChunk->num_values() > 0) + { + bFound = false; + break; + } } return v; } @@ -1945,7 +2220,7 @@ template struct GetStats v = rowGroupVal; } } - else + else if (columnChunk->num_values() > 0) { bFound = false; break; @@ -2028,25 +2303,18 @@ template <> struct GetStats }; /************************************************************************/ -/* GetMinMaxForField() */ +/* GetMinMaxForOGRField() */ /************************************************************************/ -bool OGRParquetLayer::GetMinMaxForField(int iRowGroup, // -1 for all - int iOGRField, bool bComputeMin, - OGRField &sMin, bool &bFoundMin, - bool bComputeMax, OGRField &sMax, - bool &bFoundMax, OGRFieldType &eType, - OGRFieldSubType &eSubType, - std::string &osMinTmp, - std::string &osMaxTmp) const +bool OGRParquetLayer::GetMinMaxForOGRField(int iRowGroup, // -1 for all + int iOGRField, bool bComputeMin, + OGRField &sMin, bool &bFoundMin, + bool bComputeMax, OGRField &sMax, + bool &bFoundMax, OGRFieldType &eType, + OGRFieldSubType &eSubType, + std::string &osMinTmp, + std::string &osMaxTmp) const { - const OGRFieldDefn oDummyFIDFieldDefn(m_osFIDColumn.c_str(), OFTInteger64); - const OGRFieldDefn *poFieldDefn = - iOGRField == OGR_FID_INDEX - ? &oDummyFIDFieldDefn - : const_cast(this)->GetLayerDefn()->GetFieldDefn( - iOGRField); - OGR_RawField_SetNull(&sMin); OGR_RawField_SetNull(&sMax); eType = OFTReal; @@ -2059,21 +2327,85 @@ bool OGRParquetLayer::GetMinMaxForField(int iRowGroup, // -1 for all : GetMapFieldIndexToParquetColumn()[iOGRField]; if (iCol < 0) return false; - const auto metadata = GetReader()->parquet_reader()->metadata(); - const auto numRowGroups = metadata->num_row_groups(); const auto &arrowType = iOGRField == OGR_FID_INDEX ? m_poFIDType : GetArrowFieldTypes()[iOGRField]; + const bool bRet = GetMinMaxForParquetCol( + iRowGroup, iCol, arrowType, bComputeMin, sMin, bFoundMin, bComputeMax, + sMax, bFoundMax, eType, eSubType, osMinTmp, osMaxTmp); + + if (eType == OFTInteger64 && arrowType->id() == arrow::Type::TIMESTAMP) + { + const OGRFieldDefn oDummyFIDFieldDefn(m_osFIDColumn.c_str(), + OFTInteger64); + const OGRFieldDefn *poFieldDefn = + iOGRField == OGR_FID_INDEX ? &oDummyFIDFieldDefn + : const_cast(this) + ->GetLayerDefn() + ->GetFieldDefn(iOGRField); + if (poFieldDefn->GetType() == OFTDateTime) + { + const auto timestampType = + static_cast(arrowType.get()); + if (bFoundMin) + { + const int64_t timestamp = sMin.Integer64; + OGRArrowLayer::TimestampToOGR(timestamp, timestampType, + poFieldDefn->GetTZFlag(), &sMin); + } + if (bFoundMax) + { + const int64_t timestamp = sMax.Integer64; + OGRArrowLayer::TimestampToOGR(timestamp, timestampType, + poFieldDefn->GetTZFlag(), &sMax); + } + eType = OFTDateTime; + } + } + + return bRet; +} + +/************************************************************************/ +/* GetMinMaxForParquetCol() */ +/************************************************************************/ + +bool OGRParquetLayer::GetMinMaxForParquetCol( + int iRowGroup, // -1 for all + int iCol, + const std::shared_ptr &arrowType, // potentially nullptr + bool bComputeMin, OGRField &sMin, bool &bFoundMin, bool bComputeMax, + OGRField &sMax, bool &bFoundMax, OGRFieldType &eType, + OGRFieldSubType &eSubType, std::string &osMinTmp, + std::string &osMaxTmp) const +{ + OGR_RawField_SetNull(&sMin); + OGR_RawField_SetNull(&sMax); + eType = OFTReal; + eSubType = OFSTNone; + bFoundMin = false; + bFoundMax = false; + + const auto metadata = GetReader()->parquet_reader()->metadata(); + const auto numRowGroups = metadata->num_row_groups(); + if (numRowGroups == 0) return false; - const auto rowGroup0columnChunk = metadata->RowGroup(0)->ColumnChunk(iCol); + const auto rowGroup0 = metadata->RowGroup(0); + if (iCol < 0 || iCol >= rowGroup0->num_columns()) + { + CPLError(CE_Failure, CPLE_AppDefined, + "GetMinMaxForParquetCol(): invalid iCol=%d", iCol); + return false; + } + const auto rowGroup0columnChunk = rowGroup0->ColumnChunk(iCol); const auto rowGroup0Stats = rowGroup0columnChunk->statistics(); if (!(rowGroup0columnChunk->is_stats_set() && rowGroup0Stats)) { CPLDebug("PARQUET", "Statistics not available for field %s", - poFieldDefn->GetNameRef()); + rowGroup0columnChunk->path_in_schema()->ToDotString().c_str()); return false; } @@ -2090,7 +2422,7 @@ bool OGRParquetLayer::GetMinMaxForField(int iRowGroup, // -1 for all } else if (physicalType == parquet::Type::INT32) { - if (arrowType->id() == arrow::Type::UINT32) + if (arrowType && arrowType->id() == arrow::Type::UINT32) { // With parquet file version 2.0, // statistics of uint32 fields are @@ -2106,7 +2438,7 @@ bool OGRParquetLayer::GetMinMaxForField(int iRowGroup, // -1 for all else { eType = OFTInteger; - if (poFieldDefn->GetSubType() == OFSTInt16) + if (arrowType && arrowType->id() == arrow::Type::INT16) eSubType = OFSTInt16; sMin.Integer = GetStats::min( metadata, iRowGroup, numRowGroups, iCol, bFoundMin); @@ -2131,7 +2463,9 @@ bool OGRParquetLayer::GetMinMaxForField(int iRowGroup, // -1 for all sMin.Real = GetStats::min( metadata, iRowGroup, numRowGroups, iCol, bFoundMin); } - else if (poFieldDefn->GetType() == OFTString && + else if (arrowType && + (arrowType->id() == arrow::Type::STRING || + arrowType->id() == arrow::Type::LARGE_STRING) && physicalType == parquet::Type::BYTE_ARRAY) { osMinTmp = GetStats::min( @@ -2155,7 +2489,7 @@ bool OGRParquetLayer::GetMinMaxForField(int iRowGroup, // -1 for all } else if (physicalType == parquet::Type::INT32) { - if (arrowType->id() == arrow::Type::UINT32) + if (arrowType && arrowType->id() == arrow::Type::UINT32) { // With parquet file version 2.0, // statistics of uint32 fields are @@ -2171,7 +2505,7 @@ bool OGRParquetLayer::GetMinMaxForField(int iRowGroup, // -1 for all else { eType = OFTInteger; - if (poFieldDefn->GetSubType() == OFSTInt16) + if (arrowType && arrowType->id() == arrow::Type::INT16) eSubType = OFSTInt16; sMax.Integer = GetStats::max( metadata, iRowGroup, numRowGroups, iCol, bFoundMax); @@ -2196,7 +2530,9 @@ bool OGRParquetLayer::GetMinMaxForField(int iRowGroup, // -1 for all sMax.Real = GetStats::max( metadata, iRowGroup, numRowGroups, iCol, bFoundMax); } - else if (poFieldDefn->GetType() == OFTString && + else if (arrowType && + (arrowType->id() == arrow::Type::STRING || + arrowType->id() == arrow::Type::LARGE_STRING) && physicalType == parquet::Type::BYTE_ARRAY) { osMaxTmp = GetStats::max( @@ -2209,25 +2545,30 @@ bool OGRParquetLayer::GetMinMaxForField(int iRowGroup, // -1 for all } } - if (eType == OFTInteger64 && poFieldDefn->GetType() == OFTDateTime && - arrowType->id() == arrow::Type::TIMESTAMP) + return bFoundMin || bFoundMax; +} + +/************************************************************************/ +/* GeomColsBBOXParquet() */ +/************************************************************************/ + +/** Return for a given geometry column (iGeom: in [0, GetGeomFieldCount()-1] range), + * the Parquet column number of the corresponding xmin,ymin,xmax,ymax bounding + * box columns, if existing. + */ +bool OGRParquetLayer::GeomColsBBOXParquet(int iGeom, int &iParquetXMin, + int &iParquetYMin, int &iParquetXMax, + int &iParquetYMax) const +{ + const auto oIter = m_oMapGeomFieldIndexToGeomColBBOXParquet.find(iGeom); + const bool bFound = + (oIter != m_oMapGeomFieldIndexToGeomColBBOXParquet.end()); + if (bFound) { - const auto timestampType = - static_cast(arrowType.get()); - if (bFoundMin) - { - const int64_t timestamp = sMin.Integer64; - OGRArrowLayer::TimestampToOGR(timestamp, timestampType, - poFieldDefn->GetTZFlag(), &sMin); - } - if (bFoundMax) - { - const int64_t timestamp = sMax.Integer64; - OGRArrowLayer::TimestampToOGR(timestamp, timestampType, - poFieldDefn->GetTZFlag(), &sMax); - } - eType = OFTDateTime; + iParquetXMin = oIter->second.iParquetXMin; + iParquetYMin = oIter->second.iParquetYMin; + iParquetXMax = oIter->second.iParquetXMax; + iParquetYMax = oIter->second.iParquetYMax; } - - return bFoundMin || bFoundMax; + return bFound; } diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetwriterdataset.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetwriterdataset.cpp index 545b45a420b8..f970c001a243 100644 --- a/ogr/ogrsf_frmts/parquet/ogrparquetwriterdataset.cpp +++ b/ogr/ogrsf_frmts/parquet/ogrparquetwriterdataset.cpp @@ -41,6 +41,29 @@ OGRParquetWriterDataset::OGRParquetWriterDataset( { } +/************************************************************************/ +/* Close() */ +/************************************************************************/ + +CPLErr OGRParquetWriterDataset::Close() +{ + CPLErr eErr = CE_None; + if (nOpenFlags != OPEN_FLAGS_CLOSED) + { + if (m_poLayer && !m_poLayer->Close()) + { + eErr = CE_Failure; + } + + if (GDALPamDataset::Close() != CE_None) + { + eErr = CE_Failure; + } + } + + return eErr; +} + /************************************************************************/ /* GetLayerCount() */ /************************************************************************/ diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp index 3a81ae86640d..43b496932c5d 100644 --- a/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp +++ b/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp @@ -35,6 +35,8 @@ #include "ogr_wkb.h" +#include + /************************************************************************/ /* OGRParquetWriterLayer() */ /************************************************************************/ @@ -51,13 +53,235 @@ OGRParquetWriterLayer::OGRParquetWriterLayer( } /************************************************************************/ -/* ~OGRParquetWriterLayer() */ +/* Close() */ /************************************************************************/ -OGRParquetWriterLayer::~OGRParquetWriterLayer() +bool OGRParquetWriterLayer::Close() { + if (m_poTmpGPKGLayer) + { + if (!CopyTmpGpkgLayerToFinalFile()) + return false; + } + if (m_bInitializationOK) - FinalizeWriting(); + { + if (!FinalizeWriting()) + return false; + } + + return true; +} + +/************************************************************************/ +/* CopyTmpGpkgLayerToFinalFile() */ +/************************************************************************/ + +bool OGRParquetWriterLayer::CopyTmpGpkgLayerToFinalFile() +{ + if (!m_poTmpGPKGLayer) + { + return true; + } + + CPLDebug("PARQUET", "CopyTmpGpkgLayerToFinalFile(): start..."); + + VSIUnlink(m_poTmpGPKG->GetDescription()); + + OGRFeature oFeat(m_poFeatureDefn); + + // Interval in terms of features between 2 debug progress report messages + constexpr int PROGRESS_FC_INTERVAL = 100 * 1000; + + // First, write features without geometries + { + auto poTmpLayer = std::unique_ptr(m_poTmpGPKG->ExecuteSQL( + "SELECT serialized_feature FROM tmp WHERE fid NOT IN (SELECT id " + "FROM rtree_tmp_geom)", + nullptr, nullptr)); + if (!poTmpLayer) + return false; + for (const auto &poSrcFeature : poTmpLayer.get()) + { + int nBytesFeature = 0; + const GByte *pabyFeatureData = + poSrcFeature->GetFieldAsBinary(0, &nBytesFeature); + if (!oFeat.DeserializeFromBinary(pabyFeatureData, nBytesFeature)) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Cannot deserialize feature"); + return false; + } + if (OGRArrowWriterLayer::ICreateFeature(&oFeat) != OGRERR_NONE) + { + return false; + } + + if ((m_nFeatureCount % PROGRESS_FC_INTERVAL) == 0) + { + CPLDebugProgress( + "PARQUET", + "CopyTmpGpkgLayerToFinalFile(): %.02f%% progress", + 100.0 * double(m_nFeatureCount) / + double(m_nTmpFeatureCount)); + } + } + + if (!FlushFeatures()) + { + return false; + } + } + + // Now walk through the GPKG RTree for features with geometries + // Cf https://github.com/sqlite/sqlite/blob/master/ext/rtree/rtree.c + // for the description of the content of the rtree _node table + std::vector> aNodeNoDepthPair; + int nTreeDepth = 0; + // Queue the root node + aNodeNoDepthPair.emplace_back( + std::make_pair(/* nodeNo = */ 1, /* depth = */ 0)); + int nCountWrittenFeaturesSinceLastFlush = 0; + while (!aNodeNoDepthPair.empty()) + { + const auto &oLastPair = aNodeNoDepthPair.back(); + const int64_t nNodeNo = oLastPair.first; + const int nCurDepth = oLastPair.second; + //CPLDebug("PARQUET", "Reading nodeNode=%d, curDepth=%d", int(nNodeNo), nCurDepth); + aNodeNoDepthPair.pop_back(); + + auto poRTreeLayer = std::unique_ptr(m_poTmpGPKG->ExecuteSQL( + CPLSPrintf("SELECT data FROM rtree_tmp_geom_node WHERE nodeno " + "= " CPL_FRMT_GIB, + static_cast(nNodeNo)), + nullptr, nullptr)); + if (!poRTreeLayer) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Cannot read node " CPL_FRMT_GIB, + static_cast(nNodeNo)); + return false; + } + const auto poRTreeFeature = + std::unique_ptr(poRTreeLayer->GetNextFeature()); + if (!poRTreeFeature) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Cannot read node " CPL_FRMT_GIB, + static_cast(nNodeNo)); + return false; + } + + int nNodeBytes = 0; + const GByte *pabyNodeData = + poRTreeFeature->GetFieldAsBinary(0, &nNodeBytes); + constexpr int BLOB_HEADER_SIZE = 4; + if (nNodeBytes < BLOB_HEADER_SIZE) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Not enough bytes when reading node " CPL_FRMT_GIB, + static_cast(nNodeNo)); + return false; + } + if (nNodeNo == 1) + { + // Get the RTree depth from the root node + nTreeDepth = (pabyNodeData[0] << 8) | pabyNodeData[1]; + //CPLDebug("PARQUET", "nTreeDepth = %d", nTreeDepth); + } + + const int nCellCount = (pabyNodeData[2] << 8) | pabyNodeData[3]; + constexpr int SIZEOF_CELL = 24; // int64_t + 4 float + if (nNodeBytes < BLOB_HEADER_SIZE + SIZEOF_CELL * nCellCount) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Not enough bytes when reading node " CPL_FRMT_GIB, + static_cast(nNodeNo)); + return false; + } + + size_t nOffset = BLOB_HEADER_SIZE; + if (nCurDepth == nTreeDepth) + { + // Leaf node: it references feature IDs. + + // If we are about to go above m_nRowGroupSize, flush past + // features now, to improve the spatial compacity of the row group. + if (m_nRowGroupSize > nCellCount && + nCountWrittenFeaturesSinceLastFlush + nCellCount > + m_nRowGroupSize) + { + nCountWrittenFeaturesSinceLastFlush = 0; + if (!FlushFeatures()) + { + return false; + } + } + + for (int i = 0; i < nCellCount; ++i) + { + int64_t nFID; + memcpy(&nFID, pabyNodeData + nOffset, sizeof(int64_t)); + CPL_MSBPTR64(&nFID); + + const auto poSrcFeature = std::unique_ptr( + m_poTmpGPKGLayer->GetFeature(nFID)); + if (!poSrcFeature) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Cannot get feature " CPL_FRMT_GIB, + static_cast(nFID)); + return false; + } + + int nBytesFeature = 0; + const GByte *pabyFeatureData = + poSrcFeature->GetFieldAsBinary(0, &nBytesFeature); + if (!oFeat.DeserializeFromBinary(pabyFeatureData, + nBytesFeature)) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Cannot deserialize feature"); + return false; + } + if (OGRArrowWriterLayer::ICreateFeature(&oFeat) != OGRERR_NONE) + { + return false; + } + + nOffset += SIZEOF_CELL; + + ++nCountWrittenFeaturesSinceLastFlush; + + if ((m_nFeatureCount % PROGRESS_FC_INTERVAL) == 0 || + m_nFeatureCount == m_nTmpFeatureCount / 2) + { + CPLDebugProgress( + "PARQUET", + "CopyTmpGpkgLayerToFinalFile(): %.02f%% progress", + 100.0 * double(m_nFeatureCount) / + double(m_nTmpFeatureCount)); + } + } + } + else + { + // Non-leaf node: it references child nodes. + for (int i = 0; i < nCellCount; ++i) + { + int64_t nNode; + memcpy(&nNode, pabyNodeData + nOffset, sizeof(int64_t)); + CPL_MSBPTR64(&nNode); + aNodeNoDepthPair.emplace_back( + std::make_pair(nNode, nCurDepth + 1)); + nOffset += SIZEOF_CELL; + } + } + } + + CPLDebug("PARQUET", + "CopyTmpGpkgLayerToFinalFile(): 100%%, successfully finished"); + return true; } /************************************************************************/ @@ -95,6 +319,37 @@ bool OGRParquetWriterLayer::SetOptions(CSLConstList papszOptions, const OGRSpatialReference *poSpatialRef, OGRwkbGeometryType eGType) { + m_bWriteBBoxStruct = CPLTestBool(CSLFetchNameValueDef( + papszOptions, "WRITE_COVERING_BBOX", + CPLGetConfigOption("OGR_PARQUET_WRITE_COVERING_BBOX", "YES"))); + + if (CPLTestBool(CSLFetchNameValueDef(papszOptions, "SORT_BY_BBOX", "NO"))) + { + const std::string osTmpGPKG(std::string(m_poDataset->GetDescription()) + + ".tmp.gpkg"); + auto poGPKGDrv = GetGDALDriverManager()->GetDriverByName("GPKG"); + if (!poGPKGDrv) + { + CPLError( + CE_Failure, CPLE_AppDefined, + "Driver GPKG required for SORT_BY_BBOX layer creation option"); + return false; + } + m_poTmpGPKG.reset(poGPKGDrv->Create(osTmpGPKG.c_str(), 0, 0, 0, + GDT_Unknown, nullptr)); + if (!m_poTmpGPKG) + return false; + m_poTmpGPKG->MarkSuppressOnClose(); + m_poTmpGPKGLayer = m_poTmpGPKG->CreateLayer("tmp"); + if (!m_poTmpGPKGLayer) + return false; + // Serialized feature + m_poTmpGPKGLayer->CreateField( + std::make_unique("serialized_feature", OFTBinary) + .get()); + CPL_IGNORE_RET_VAL(m_poTmpGPKGLayer->StartTransaction()); + } + const char *pszGeomEncoding = CSLFetchNameValue(papszOptions, "GEOMETRY_ENCODING"); m_eGeomEncoding = OGRArrowGeomEncoding::WKB; @@ -230,7 +485,7 @@ bool OGRParquetWriterLayer::SetOptions(CSLConstList papszOptions, /* CloseFileWriter() */ /************************************************************************/ -void OGRParquetWriterLayer::CloseFileWriter() +bool OGRParquetWriterLayer::CloseFileWriter() { auto status = m_poFileWriter->Close(); if (!status.ok()) @@ -239,6 +494,7 @@ void OGRParquetWriterLayer::CloseFileWriter() "FileWriter::Close() failed with %s", status.message().c_str()); } + return status.ok(); } /************************************************************************/ @@ -477,6 +733,27 @@ std::string OGRParquetWriterLayer::GetGeoMetadata() const oColumn.Add("bbox", oBBOX); } + // Bounding box column definition + if (m_bWriteBBoxStruct) + { + CPLJSONObject oCovering; + oColumn.Add("covering", oCovering); + CPLJSONObject oBBOX; + oCovering.Add("bbox", oBBOX); + const auto AddComponent = + [this, i, &oBBOX](const char *pszComponent) + { + CPLJSONArray oArray; + oArray.Add(m_apoFieldsBBOX[i]->name()); + oArray.Add(pszComponent); + oBBOX.Add(pszComponent, oArray); + }; + AddComponent("xmin"); + AddComponent("ymin"); + AddComponent("xmax"); + AddComponent("ymax"); + } + const auto GetStringGeometryType = [](OGRwkbGeometryType eType) { const auto eFlattenType = wkbFlatten(eType); @@ -715,6 +992,68 @@ void OGRParquetWriterLayer::CreateWriter() &m_poKeyValueMetadata)); } +/************************************************************************/ +/* ICreateFeature() */ +/************************************************************************/ + +OGRErr OGRParquetWriterLayer::ICreateFeature(OGRFeature *poFeature) +{ + // If not using SORT_BY_BBOX=YES layer creation option, we can directly + // write features to the final Parquet file + if (!m_poTmpGPKGLayer) + return OGRArrowWriterLayer::ICreateFeature(poFeature); + + // SORT_BY_BBOX=YES case: we write for now a serialized version of poFeature + // in a temporary GeoPackage file. + + GIntBig nFID = poFeature->GetFID(); + if (!m_osFIDColumn.empty() && nFID == OGRNullFID) + { + nFID = m_nTmpFeatureCount; + poFeature->SetFID(nFID); + } + ++m_nTmpFeatureCount; + + std::vector abyBuffer; + // Serialize the source feature as a single array of bytes to preserve it + // fully + if (!poFeature->SerializeToBinary(abyBuffer)) + { + return OGRERR_FAILURE; + } + + // SQLite3 limitation: a row must fit in slightly less than 1 GB. + constexpr int SOME_MARGIN = 128; + if (abyBuffer.size() > 1024 * 1024 * 1024 - SOME_MARGIN) + { + CPLError(CE_Failure, CPLE_NotSupported, + "Features larger than 1 GB are not supported"); + return OGRERR_FAILURE; + } + + OGRFeature oFeat(m_poTmpGPKGLayer->GetLayerDefn()); + oFeat.SetFID(nFID); + oFeat.SetField(0, static_cast(abyBuffer.size()), abyBuffer.data()); + const auto poSrcGeom = poFeature->GetGeometryRef(); + if (poSrcGeom && !poSrcGeom->IsEmpty()) + { + // For the purpose of building an RTree, just use the bounding box of + // the geometry as the geometry. + OGREnvelope sEnvelope; + poSrcGeom->getEnvelope(&sEnvelope); + auto poPoly = std::make_unique(); + auto poLR = std::make_unique(); + poLR->addPoint(sEnvelope.MinX, sEnvelope.MinY); + poLR->addPoint(sEnvelope.MinX, sEnvelope.MaxY); + poLR->addPoint(sEnvelope.MaxX, sEnvelope.MaxY); + poLR->addPoint(sEnvelope.MaxX, sEnvelope.MinY); + poLR->addPoint(sEnvelope.MinX, sEnvelope.MinY); + poPoly->addRingDirectly(poLR.release()); + oFeat.SetGeometryDirectly(poPoly.release()); + } + return m_poTmpGPKGLayer->CreateFeature(&oFeat); +} + /************************************************************************/ /* FlushGroup() */ /************************************************************************/ @@ -726,7 +1065,7 @@ bool OGRParquetWriterLayer::FlushGroup() { CPLError(CE_Failure, CPLE_AppDefined, "NewRowGroup() failed with %s", status.message().c_str()); - m_apoBuilders.clear(); + ClearArrayBuilers(); return false; } @@ -745,7 +1084,7 @@ bool OGRParquetWriterLayer::FlushGroup() return true; }); - m_apoBuilders.clear(); + ClearArrayBuilers(); return ret; } @@ -807,6 +1146,15 @@ OGRParquetWriterLayer::WriteArrowBatch(const struct ArrowSchema *schema, struct ArrowArray *array, CSLConstList papszOptions) { + if (m_poTmpGPKGLayer) + { + // When using SORT_BY_BBOX=YES option, we can't directly write the + // input array, because we need to sort features. Hence we fallback + // to the OGRLayer base implementation, which will ultimately call + // OGRParquetWriterLayer::ICreateFeature() + return OGRLayer::WriteArrowBatch(schema, array, papszOptions); + } + return WriteArrowBatchInternal( schema, array, papszOptions, [this](const std::shared_ptr &poBatch) @@ -844,9 +1192,40 @@ inline int OGRParquetWriterLayer::TestCapability(const char *pszCap) if (EQUAL(pszCap, OLCFastWriteArrowBatch)) return false; #endif + + if (m_poTmpGPKGLayer && EQUAL(pszCap, OLCFastWriteArrowBatch)) + { + // When using SORT_BY_BBOX=YES option, we can't directly write the + // input array, because we need to sort features. So this is not + // fast + return false; + } + return OGRArrowWriterLayer::TestCapability(pszCap); } +/************************************************************************/ +/* CreateFieldFromArrowSchema() */ +/************************************************************************/ + +#if PARQUET_VERSION_MAJOR > 10 +bool OGRParquetWriterLayer::CreateFieldFromArrowSchema( + const struct ArrowSchema *schema, CSLConstList papszOptions) +{ + if (m_poTmpGPKGLayer) + { + // When using SORT_BY_BBOX=YES option, we can't directly write the + // input array, because we need to sort features. But this process + // only supports the base Arrow types supported by + // OGRLayer::WriteArrowBatch() + return OGRLayer::CreateFieldFromArrowSchema(schema, papszOptions); + } + + return OGRArrowWriterLayer::CreateFieldFromArrowSchema(schema, + papszOptions); +} +#endif + /************************************************************************/ /* IsArrowSchemaSupported() */ /************************************************************************/ @@ -856,6 +1235,16 @@ bool OGRParquetWriterLayer::IsArrowSchemaSupported( const struct ArrowSchema *schema, CSLConstList papszOptions, std::string &osErrorMsg) const { + if (m_poTmpGPKGLayer) + { + // When using SORT_BY_BBOX=YES option, we can't directly write the + // input array, because we need to sort features. But this process + // only supports the base Arrow types supported by + // OGRLayer::WriteArrowBatch() + return OGRLayer::IsArrowSchemaSupported(schema, papszOptions, + osErrorMsg); + } + if (schema->format[0] == 'e' && schema->format[1] == 0) { osErrorMsg = "float16 not supported"; diff --git a/port/cpl_error.cpp b/port/cpl_error.cpp index a08d55f2a181..fd522b8338d9 100644 --- a/port/cpl_error.cpp +++ b/port/cpl_error.cpp @@ -31,6 +31,10 @@ #include "cpl_error.h" +#ifndef _WIN32 +#include // isatty() +#endif + #include #include #include @@ -80,19 +84,22 @@ typedef struct CPLErrorHandlerNode *psHandlerStack; int nLastErrMsgMax; int nFailureIntoWarning; + bool bProgressMode; + bool bEmitNewlineBeforeNextDbgMsg; GUInt32 nErrorCounter; char szLastErrMsg[DEFAULT_LAST_ERR_MSG_SIZE]; // Do not add anything here. szLastErrMsg must be the last field. // See CPLRealloc() below. } CPLErrorContext; -constexpr CPLErrorContext sNoErrorContext = {0, CE_None, nullptr, 0, 0, 0, ""}; +constexpr CPLErrorContext sNoErrorContext = {0, CE_None, nullptr, 0, 0, + false, false, 0, ""}; constexpr CPLErrorContext sWarningContext = { - 0, CE_Warning, nullptr, 0, 0, 0, "A warning was emitted"}; + 0, CE_Warning, nullptr, 0, 0, false, false, 0, "A warning was emitted"}; constexpr CPLErrorContext sFailureContext = { - 0, CE_Warning, nullptr, 0, 0, 0, "A failure was emitted"}; + 0, CE_Warning, nullptr, 0, 0, false, false, 0, "A failure was emitted"}; #define IS_PREFEFINED_ERROR_CTX(psCtxt) \ (psCtx == &sNoErrorContext || psCtx == &sWarningContext || \ @@ -586,37 +593,14 @@ static int CPLGettimeofday(struct CPLTimeVal *tp, void * /* timezonep*/) #define CPLGettimeofday(t, u) gettimeofday(t, u) #endif +#ifndef WITHOUT_CPLDEBUG + /************************************************************************/ -/* CPLDebug() */ +/* CPLvDebug() */ /************************************************************************/ -/** - * Display a debugging message. - * - * The category argument is used in conjunction with the CPL_DEBUG - * environment variable to establish if the message should be displayed. - * If the CPL_DEBUG environment variable is not set, no debug messages - * are emitted (use CPLError(CE_Warning, ...) to ensure messages are displayed). - * If CPL_DEBUG is set, but is an empty string or the word "ON" then all - * debug messages are shown. Otherwise only messages whose category appears - * somewhere within the CPL_DEBUG value are displayed (as determined by - * strstr()). - * - * Categories are usually an identifier for the subsystem producing the - * error. For instance "GDAL" might be used for the GDAL core, and "TIFF" - * for messages from the TIFF translator. - * - * @param pszCategory name of the debugging message category. - * @param pszFormat printf() style format string for message to display. - * Remaining arguments are assumed to be for format. - */ - -#ifdef WITHOUT_CPLDEBUG -// Do not include CPLDebug. Only available in custom builds. -#else -void CPLDebug(const char *pszCategory, CPL_FORMAT_STRING(const char *pszFormat), - ...) - +static void CPLvDebug(const char *pszCategory, + CPL_FORMAT_STRING(const char *pszFormat), va_list args) { CPLErrorContext *psCtx = CPLGetErrorContext(); if (psCtx == nullptr || IS_PREFEFINED_ERROR_CTX(psCtx)) @@ -703,14 +687,9 @@ void CPLDebug(const char *pszCategory, CPL_FORMAT_STRING(const char *pszFormat), /* -------------------------------------------------------------------- */ /* Format the application provided portion of the debug message. */ /* -------------------------------------------------------------------- */ - va_list args; - va_start(args, pszFormat); - CPLvsnprintf(pszMessage + strlen(pszMessage), ERROR_MAX - strlen(pszMessage), pszFormat, args); - va_end(args); - /* -------------------------------------------------------------------- */ /* Obfuscate any password in error message */ /* -------------------------------------------------------------------- */ @@ -733,6 +712,100 @@ void CPLDebug(const char *pszCategory, CPL_FORMAT_STRING(const char *pszFormat), VSIFree(pszMessage); } + +#endif // !WITHOUT_CPLDEBUG + +/************************************************************************/ +/* CPLDebug() */ +/************************************************************************/ + +/** + * Display a debugging message. + * + * The category argument is used in conjunction with the CPL_DEBUG + * environment variable to establish if the message should be displayed. + * If the CPL_DEBUG environment variable is not set, no debug messages + * are emitted (use CPLError(CE_Warning, ...) to ensure messages are displayed). + * If CPL_DEBUG is set, but is an empty string or the word "ON" then all + * debug messages are shown. Otherwise only messages whose category appears + * somewhere within the CPL_DEBUG value are displayed (as determined by + * strstr()). + * + * Categories are usually an identifier for the subsystem producing the + * error. For instance "GDAL" might be used for the GDAL core, and "TIFF" + * for messages from the TIFF translator. + * + * @param pszCategory name of the debugging message category. + * @param pszFormat printf() style format string for message to display. + * Remaining arguments are assumed to be for format. + */ + +#ifdef WITHOUT_CPLDEBUG +// Do not include CPLDebug. Only available in custom builds. +#else + +void CPLDebug(const char *pszCategory, CPL_FORMAT_STRING(const char *pszFormat), + ...) + +{ + va_list args; + va_start(args, pszFormat); + CPLvDebug(pszCategory, pszFormat, args); + va_end(args); +} + +#endif // WITHOUT_CPLDEBUG + +/************************************************************************/ +/* CPLDebugProgress() */ +/************************************************************************/ + +/** + * Display a debugging message indicating a progression. + * + * This is the same as CPLDebug(), except that when displaying on the terminal, + * it will erase the previous debug progress message. This is for example + * appropriate to display increasing percentages for a task. + * + * The category argument is used in conjunction with the CPL_DEBUG + * environment variable to establish if the message should be displayed. + * If the CPL_DEBUG environment variable is not set, no debug messages + * are emitted (use CPLError(CE_Warning, ...) to ensure messages are displayed). + * If CPL_DEBUG is set, but is an empty string or the word "ON" then all + * debug messages are shown. Otherwise only messages whose category appears + * somewhere within the CPL_DEBUG value are displayed (as determined by + * strstr()). + * + * Categories are usually an identifier for the subsystem producing the + * error. For instance "GDAL" might be used for the GDAL core, and "TIFF" + * for messages from the TIFF translator. + * + * @param pszCategory name of the debugging message category. + * @param pszFormat printf() style format string for message to display. + * Remaining arguments are assumed to be for format. + * @since 3.9 + */ + +#ifdef WITHOUT_CPLDEBUG +// Do not include CPLDebugProgress. Only available in custom builds. +#else +void CPLDebugProgress(const char *pszCategory, + CPL_FORMAT_STRING(const char *pszFormat), ...) + +{ + CPLErrorContext *psCtx = CPLGetErrorContext(); + if (psCtx == nullptr || IS_PREFEFINED_ERROR_CTX(psCtx)) + return; + + psCtx->bProgressMode = true; + + va_list args; + va_start(args, pszFormat); + CPLvDebug(pszCategory, pszFormat, args); + va_end(args); + + psCtx->bProgressMode = false; +} #endif // !WITHOUT_CPLDEBUG /********************************************************************** @@ -983,7 +1056,36 @@ void CPL_STDCALL CPLDefaultErrorHandler(CPLErr eErrClass, CPLErrorNum nError, } if (eErrClass == CE_Debug) - fprintf(fpLog, "%s\n", pszErrorMsg); + { +#ifndef _WIN32 + CPLErrorContext *psCtx = CPLGetErrorContext(); + if (psCtx != nullptr && !IS_PREFEFINED_ERROR_CTX(psCtx) && + fpLog == stderr && isatty(static_cast(fileno(stderr)))) + { + if (psCtx->bProgressMode) + { + // Erase the content of the current line + fprintf(stderr, "\r"); + fprintf(stderr, "%s", pszErrorMsg); + fflush(stderr); + psCtx->bEmitNewlineBeforeNextDbgMsg = true; + } + else + { + if (psCtx->bEmitNewlineBeforeNextDbgMsg) + { + psCtx->bEmitNewlineBeforeNextDbgMsg = false; + fprintf(fpLog, "\n"); + } + fprintf(fpLog, "%s\n", pszErrorMsg); + } + } + else +#endif + { + fprintf(fpLog, "%s\n", pszErrorMsg); + } + } else if (eErrClass == CE_Warning) fprintf(fpLog, "Warning %d: %s\n", nError, pszErrorMsg); else diff --git a/port/cpl_error.h b/port/cpl_error.h index 91d27ca5430e..de376ac9d0ae 100644 --- a/port/cpl_error.h +++ b/port/cpl_error.h @@ -180,10 +180,13 @@ void CPL_DLL CPL_STDCALL CPLSetCurrentErrorHandlerCatchDebug(int bCatchDebug); void CPL_DLL CPL_STDCALL CPLPopErrorHandler(void); #ifdef WITHOUT_CPLDEBUG -#define CPLDebug(...) /* Eat all CPLDebug calls. */ +#define CPLDebug(...) /* Eat all CPLDebug calls. */ +#define CPLDebugProgress(...) /* Eat all CPLDebugProgress calls. */ #else void CPL_DLL CPLDebug(const char *, CPL_FORMAT_STRING(const char *), ...) CPL_PRINT_FUNC_FORMAT(2, 3); +void CPL_DLL CPLDebugProgress(const char *, CPL_FORMAT_STRING(const char *), + ...) CPL_PRINT_FUNC_FORMAT(2, 3); #endif #ifdef DEBUG diff --git a/swig/include/ogr.i b/swig/include/ogr.i index b5fe53e35444..a085e8063541 100644 --- a/swig/include/ogr.i +++ b/swig/include/ogr.i @@ -511,6 +511,7 @@ typedef void retGetPoints; %constant char *OLCZGeometries = "ZGeometries"; %constant char *OLCRename = "Rename"; %constant char *OLCFastGetArrowStream = "FastGetArrowStream"; +%constant char *OLCFastWriteArrowBatch = "FastWriteArrowBatch"; %constant char *ODsCCreateLayer = "CreateLayer"; %constant char *ODsCDeleteLayer = "DeleteLayer"; @@ -568,6 +569,7 @@ typedef int OGRErr; #define OLCZGeometries "ZGeometries" #define OLCRename "Rename" #define OLCFastGetArrowStream "FastGetArrowStream" +#define OLCFastWriteArrowBatch "FastWriteArrowBatch" #define ODsCCreateLayer "CreateLayer" #define ODsCDeleteLayer "DeleteLayer"