diff --git a/CHANGELOG.md b/CHANGELOG.md index d0b24320ea..c99dd12bde 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed +- Expose Charset in the ShapeFileReader API [#3464](https://github.com/locationtech/geotrellis/pull/3464) + ## [3.6.2] - 2022-04-05 ### Changed diff --git a/shapefile/data/shapefiles/demographics-utf8/demographics.dbf b/shapefile/data/shapefiles/demographics-utf8/demographics.dbf new file mode 100644 index 0000000000..e5f10752b0 Binary files /dev/null and b/shapefile/data/shapefiles/demographics-utf8/demographics.dbf differ diff --git a/shapefile/data/shapefiles/demographics-utf8/demographics.prj b/shapefile/data/shapefiles/demographics-utf8/demographics.prj new file mode 100644 index 0000000000..a04443eaa5 --- /dev/null +++ b/shapefile/data/shapefiles/demographics-utf8/demographics.prj @@ -0,0 +1 @@ +PROJCS["WGS_84_World_Mercator",GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]],PROJECTION["Mercator"],PARAMETER["central_meridian",0],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["Meter",1],PARAMETER["standard_parallel_1",0.0]] \ No newline at end of file diff --git a/shapefile/data/shapefiles/demographics-utf8/demographics.qpj b/shapefile/data/shapefiles/demographics-utf8/demographics.qpj new file mode 100644 index 0000000000..661e4cb11e --- /dev/null +++ b/shapefile/data/shapefiles/demographics-utf8/demographics.qpj @@ -0,0 +1 @@ +PROJCS["WGS 84 / World Mercator",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 
84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Mercator_1SP"],PARAMETER["central_meridian",0],PARAMETER["scale_factor",1],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],AUTHORITY["EPSG","3395"]] diff --git a/shapefile/data/shapefiles/demographics-utf8/demographics.shp b/shapefile/data/shapefiles/demographics-utf8/demographics.shp new file mode 100644 index 0000000000..6426f75418 Binary files /dev/null and b/shapefile/data/shapefiles/demographics-utf8/demographics.shp differ diff --git a/shapefile/data/shapefiles/demographics-utf8/demographics.shx b/shapefile/data/shapefiles/demographics-utf8/demographics.shx new file mode 100644 index 0000000000..3d29e2d541 Binary files /dev/null and b/shapefile/data/shapefiles/demographics-utf8/demographics.shx differ diff --git a/shapefile/src/main/scala/geotrellis/shapefile/ShapeFileReader.scala b/shapefile/src/main/scala/geotrellis/shapefile/ShapeFileReader.scala index 9af0551d46..33fca32500 100644 --- a/shapefile/src/main/scala/geotrellis/shapefile/ShapeFileReader.scala +++ b/shapefile/src/main/scala/geotrellis/shapefile/ShapeFileReader.scala @@ -24,11 +24,13 @@ import org.geotools.data.shapefile._ import java.net.URL import java.io.File +import java.nio.charset.Charset import scala.collection.mutable import scala.collection.JavaConverters._ object ShapeFileReader { + val DEFAULT_CHARSET = Charset.forName("ISO-8859-1") implicit class SimpleFeatureWrapper(ft: SimpleFeature) { def geom[G <: Geometry: Manifest]: Option[G] = ft.getAttribute(0) match { @@ -45,11 +47,15 @@ object ShapeFileReader { ft.getAttribute(name).asInstanceOf[D] } - def readSimpleFeatures(path: String): Seq[SimpleFeature] = readSimpleFeatures(new URL(s"file://${new 
File(path).getAbsolutePath}")) + // TODO: use default argument instead of overloads in the next major release + def readSimpleFeatures(path: String): Seq[SimpleFeature] = readSimpleFeatures(new URL(s"file://${new File(path).getAbsolutePath}"), DEFAULT_CHARSET) + def readSimpleFeatures(path: String, charSet: Charset): Seq[SimpleFeature] = readSimpleFeatures(new URL(s"file://${new File(path).getAbsolutePath}"), charSet) - def readSimpleFeatures(url: URL): Seq[SimpleFeature] = { + def readSimpleFeatures(url: URL): Seq[SimpleFeature] = readSimpleFeatures(url, DEFAULT_CHARSET) + def readSimpleFeatures(url: URL, charSet: Charset): Seq[SimpleFeature] = { // Extract the features as GeoTools 'SimpleFeatures' val ds = new ShapefileDataStore(url) + ds.setCharset(charSet) val ftItr: SimpleFeatureIterator = ds.getFeatureSource.getFeatures.features try { @@ -62,99 +68,123 @@ object ShapeFileReader { } } - def readPointFeatures(path: String): Seq[PointFeature[Map[String,Object]]] = - readSimpleFeatures(path) + def readPointFeatures(path: String): Seq[PointFeature[Map[String,Object]]] = readPointFeatures(path, DEFAULT_CHARSET) + def readPointFeatures(path: String, charSet: Charset): Seq[PointFeature[Map[String,Object]]] = + readSimpleFeatures(path, charSet) .flatMap { ft => ft.geom[Point].map(PointFeature(_, ft.attributeMap)) } - def readPointFeatures[D](path: String, dataField: String): Seq[PointFeature[D]] = - readSimpleFeatures(path) + def readPointFeatures[D](path: String, dataField: String): Seq[PointFeature[D]] = readPointFeatures(path, dataField, DEFAULT_CHARSET) + def readPointFeatures[D](path: String, dataField: String, charSet: Charset): Seq[PointFeature[D]] = + readSimpleFeatures(path, charSet) .flatMap { ft => ft.geom[Point].map(PointFeature(_, ft.attribute[D](dataField))) } - def readPointFeatures(url: URL): Seq[PointFeature[Map[String,Object]]] = - readSimpleFeatures(url) + def readPointFeatures(url: URL): Seq[PointFeature[Map[String,Object]]] = 
readPointFeatures(url, DEFAULT_CHARSET) + def readPointFeatures(url: URL, charSet: Charset): Seq[PointFeature[Map[String,Object]]] = + readSimpleFeatures(url, charSet) .flatMap { ft => ft.geom[Point].map(PointFeature(_, ft.attributeMap)) } - def readPointFeatures[D](url: URL, dataField: String): Seq[PointFeature[D]] = - readSimpleFeatures(url) + def readPointFeatures[D](url: URL, dataField: String): Seq[PointFeature[D]] = readPointFeatures(url, dataField, DEFAULT_CHARSET) + def readPointFeatures[D](url: URL, dataField: String, charSet: Charset): Seq[PointFeature[D]] = + readSimpleFeatures(url, charSet) .flatMap { ft => ft.geom[Point].map(PointFeature(_, ft.attribute[D](dataField))) } - def readLineFeatures(path: String): Seq[Feature[LineString, Map[String,Object]]] = - readSimpleFeatures(path) + def readLineFeatures(path: String): Seq[Feature[LineString, Map[String,Object]]] = readLineFeatures(path, DEFAULT_CHARSET) + def readLineFeatures(path: String, charSet: Charset): Seq[Feature[LineString, Map[String,Object]]] = + readSimpleFeatures(path, charSet) .flatMap { ft => ft.geom[LineString].map(Feature(_, ft.attributeMap)) } - def readLineFeatures[D](path: String, dataField: String): Seq[Feature[LineString, D]] = - readSimpleFeatures(path) + def readLineFeatures[D](path: String, dataField: String): Seq[Feature[LineString, D]] = readLineFeatures(path, dataField, DEFAULT_CHARSET) + def readLineFeatures[D](path: String, dataField: String, charSet: Charset): Seq[Feature[LineString, D]] = + readSimpleFeatures(path, charSet) .flatMap { ft => ft.geom[LineString].map(Feature(_, ft.attribute[D](dataField))) } - def readLineFeatures(url: URL): Seq[Feature[LineString, Map[String,Object]]] = - readSimpleFeatures(url) + def readLineFeatures(url: URL): Seq[Feature[LineString, Map[String,Object]]] = readLineFeatures(url, DEFAULT_CHARSET) + def readLineFeatures(url: URL, charSet: Charset): Seq[Feature[LineString, Map[String,Object]]] = + readSimpleFeatures(url, charSet) .flatMap { 
ft => ft.geom[LineString].map(Feature(_, ft.attributeMap)) } - def readLineFeatures[D](url: URL, dataField: String): Seq[Feature[LineString, D]] = - readSimpleFeatures(url) + def readLineFeatures[D](url: URL, dataField: String): Seq[Feature[LineString, D]] = readLineFeatures(url, dataField, DEFAULT_CHARSET) + def readLineFeatures[D](url: URL, dataField: String, charSet: Charset): Seq[Feature[LineString, D]] = + readSimpleFeatures(url, charSet) .flatMap { ft => ft.geom[LineString].map(Feature(_, ft.attribute[D](dataField))) } - def readPolygonFeatures(path: String): Seq[PolygonFeature[Map[String,Object]]] = - readSimpleFeatures(path) + def readPolygonFeatures(path: String): Seq[PolygonFeature[Map[String,Object]]] = readPolygonFeatures(path, DEFAULT_CHARSET) + def readPolygonFeatures(path: String, charSet: Charset): Seq[PolygonFeature[Map[String,Object]]] = + readSimpleFeatures(path, charSet) .flatMap { ft => ft.geom[Polygon].map(PolygonFeature(_, ft.attributeMap)) } - def readPolygonFeatures[D](path: String, dataField: String): Seq[PolygonFeature[D]] = - readSimpleFeatures(path) + def readPolygonFeatures[D](path: String, dataField: String): Seq[PolygonFeature[D]] = readPolygonFeatures(path, dataField, DEFAULT_CHARSET) + def readPolygonFeatures[D](path: String, dataField: String, charSet: Charset): Seq[PolygonFeature[D]] = + readSimpleFeatures(path, charSet) .flatMap { ft => ft.geom[Polygon].map(PolygonFeature(_, ft.attribute[D](dataField))) } - def readPolygonFeatures(url: URL): Seq[PolygonFeature[Map[String,Object]]] = - readSimpleFeatures(url) + def readPolygonFeatures(url: URL): Seq[PolygonFeature[Map[String,Object]]] = readPolygonFeatures(url, DEFAULT_CHARSET) + def readPolygonFeatures(url: URL, charSet: Charset): Seq[PolygonFeature[Map[String,Object]]] = + readSimpleFeatures(url, charSet) .flatMap { ft => ft.geom[Polygon].map(PolygonFeature(_, ft.attributeMap)) } - def readPolygonFeatures[D](url: URL, dataField: String): Seq[PolygonFeature[D]] = - 
readSimpleFeatures(url) + def readPolygonFeatures[D](url: URL, dataField: String): Seq[PolygonFeature[D]] = readPolygonFeatures(url, dataField, DEFAULT_CHARSET) + def readPolygonFeatures[D](url: URL, dataField: String, charSet: Charset): Seq[PolygonFeature[D]] = + readSimpleFeatures(url, charSet) .flatMap { ft => ft.geom[Polygon].map(PolygonFeature(_, ft.attribute[D](dataField))) } - def readMultiPointFeatures(path: String): Seq[MultiPointFeature[Map[String,Object]]] = - readSimpleFeatures(path) + def readMultiPointFeatures(path: String): Seq[MultiPointFeature[Map[String,Object]]] = readMultiPointFeatures(path, DEFAULT_CHARSET) + def readMultiPointFeatures(path: String, charSet: Charset): Seq[MultiPointFeature[Map[String,Object]]] = + readSimpleFeatures(path, charSet) .flatMap { ft => ft.geom[MultiPoint].map(MultiPointFeature(_, ft.attributeMap)) } - def readMultiPointFeatures[D](path: String, dataField: String): Seq[MultiPointFeature[D]] = - readSimpleFeatures(path) + def readMultiPointFeatures[D](path: String, dataField: String): Seq[MultiPointFeature[D]] = readMultiPointFeatures(path, dataField, DEFAULT_CHARSET) + def readMultiPointFeatures[D](path: String, dataField: String, charSet: Charset): Seq[MultiPointFeature[D]] = + readSimpleFeatures(path, charSet) .flatMap { ft => ft.geom[MultiPoint].map(MultiPointFeature(_, ft.attribute[D](dataField))) } - def readMultiPointFeatures(url: URL): Seq[MultiPointFeature[Map[String,Object]]] = - readSimpleFeatures(url) + def readMultiPointFeatures(url: URL): Seq[MultiPointFeature[Map[String,Object]]] = readMultiPointFeatures(url, DEFAULT_CHARSET) + def readMultiPointFeatures(url: URL, charSet: Charset): Seq[MultiPointFeature[Map[String,Object]]] = + readSimpleFeatures(url, charSet) .flatMap { ft => ft.geom[MultiPoint].map(MultiPointFeature(_, ft.attributeMap)) } - def readMultiPointFeatures[D](url: URL, dataField: String): Seq[MultiPointFeature[D]] = - readSimpleFeatures(url) + def readMultiPointFeatures[D](url: URL, 
dataField: String): Seq[MultiPointFeature[D]] = readMultiPointFeatures(url, dataField, DEFAULT_CHARSET) + def readMultiPointFeatures[D](url: URL, dataField: String, charSet: Charset): Seq[MultiPointFeature[D]] = + readSimpleFeatures(url, charSet) .flatMap { ft => ft.geom[MultiPoint].map(MultiPointFeature(_, ft.attribute[D](dataField))) } - def readMultiLineFeatures(path: String): Seq[Feature[MultiLineString, Map[String,Object]]] = - readSimpleFeatures(path) + def readMultiLineFeatures(path: String): Seq[Feature[MultiLineString, Map[String,Object]]] = readMultiLineFeatures(path, DEFAULT_CHARSET) + def readMultiLineFeatures(path: String, charSet: Charset): Seq[Feature[MultiLineString, Map[String,Object]]] = + readSimpleFeatures(path, charSet) .flatMap { ft => ft.geom[MultiLineString].map(Feature(_, ft.attributeMap)) } - def readMultiLineFeatures[D](path: String, dataField: String): Seq[Feature[MultiLineString, D]] = - readSimpleFeatures(path) + def readMultiLineFeatures[D](path: String, dataField: String): Seq[Feature[MultiLineString, D]] = readMultiLineFeatures(path, dataField, DEFAULT_CHARSET) + def readMultiLineFeatures[D](path: String, dataField: String, charSet: Charset): Seq[Feature[MultiLineString, D]] = + readSimpleFeatures(path, charSet) .flatMap { ft => ft.geom[MultiLineString].map(Feature(_, ft.attribute[D](dataField))) } - def readMultiLineFeatures(url: URL): Seq[Feature[MultiLineString, Map[String,Object]]] = - readSimpleFeatures(url) + def readMultiLineFeatures(url: URL): Seq[Feature[MultiLineString, Map[String,Object]]] = readMultiLineFeatures(url, DEFAULT_CHARSET) + def readMultiLineFeatures(url: URL, charSet: Charset): Seq[Feature[MultiLineString, Map[String,Object]]] = + readSimpleFeatures(url, charSet) .flatMap { ft => ft.geom[MultiLineString].map(Feature(_, ft.attributeMap)) } - def readMultiLineFeatures[D](url: URL, dataField: String): Seq[Feature[MultiLineString, D]] = - readSimpleFeatures(url) + def readMultiLineFeatures[D](url: URL, dataField: 
String): Seq[Feature[MultiLineString, D]] = readMultiLineFeatures(url, dataField, DEFAULT_CHARSET) + def readMultiLineFeatures[D](url: URL, dataField: String, charSet: Charset): Seq[Feature[MultiLineString, D]] = + readSimpleFeatures(url, charSet) .flatMap { ft => ft.geom[MultiLineString].map(Feature(_, ft.attribute[D](dataField))) } - def readMultiPolygonFeatures(path: String): Seq[MultiPolygonFeature[Map[String,Object]]] = - readSimpleFeatures(path) + def readMultiPolygonFeatures(path: String): Seq[MultiPolygonFeature[Map[String,Object]]] = readMultiPolygonFeatures(path, DEFAULT_CHARSET) + def readMultiPolygonFeatures(path: String, charSet: Charset): Seq[MultiPolygonFeature[Map[String,Object]]] = + readSimpleFeatures(path, charSet) .flatMap { ft => ft.geom[MultiPolygon].map(MultiPolygonFeature(_, ft.attributeMap)) } - def readMultiPolygonFeatures[D](path: String, dataField: String): Seq[MultiPolygonFeature[D]] = - readSimpleFeatures(path) + def readMultiPolygonFeatures[D](path: String, dataField: String): Seq[MultiPolygonFeature[D]] = readMultiPolygonFeatures(path, dataField, DEFAULT_CHARSET) + def readMultiPolygonFeatures[D](path: String, dataField: String, charSet: Charset): Seq[MultiPolygonFeature[D]] = + readSimpleFeatures(path, charSet) .flatMap { ft => ft.geom[MultiPolygon].map(MultiPolygonFeature(_, ft.attribute[D](dataField))) } - def readMultiPolygonFeatures(url: URL): Seq[MultiPolygonFeature[Map[String,Object]]] = - readSimpleFeatures(url) + def readMultiPolygonFeatures(url: URL): Seq[MultiPolygonFeature[Map[String,Object]]] = readMultiPolygonFeatures(url, DEFAULT_CHARSET) + def readMultiPolygonFeatures(url: URL, charSet: Charset): Seq[MultiPolygonFeature[Map[String,Object]]] = + readSimpleFeatures(url, charSet) .flatMap { ft => ft.geom[MultiPolygon].map(MultiPolygonFeature(_, ft.attributeMap)) } - def readMultiPolygonFeatures[D](url: URL, dataField: String): Seq[MultiPolygonFeature[D]] = - readSimpleFeatures(url) + def readMultiPolygonFeatures[D](url: 
URL, dataField: String): Seq[MultiPolygonFeature[D]] = readMultiPolygonFeatures(url, dataField, DEFAULT_CHARSET) + def readMultiPolygonFeatures[D](url: URL, dataField: String, charSet: Charset): Seq[MultiPolygonFeature[D]] = + readSimpleFeatures(url, charSet) .flatMap { ft => ft.geom[MultiPolygon].map(MultiPolygonFeature(_, ft.attribute[D](dataField))) } } diff --git a/shapefile/src/test/scala/geotrellis/shapefile/ShapeFileReaderSpec.scala b/shapefile/src/test/scala/geotrellis/shapefile/ShapeFileReaderSpec.scala index 10a6abf7b1..7178c231f1 100644 --- a/shapefile/src/test/scala/geotrellis/shapefile/ShapeFileReaderSpec.scala +++ b/shapefile/src/test/scala/geotrellis/shapefile/ShapeFileReaderSpec.scala @@ -16,6 +16,9 @@ package geotrellis.shapefile +import java.net.URL +import java.nio.charset.Charset + import geotrellis.vector._ import org.scalatest.matchers.should.Matchers @@ -31,5 +34,18 @@ class ShapeFileReaderSpec extends AnyFunSpec with Matchers { data.keys.toSeq should be (Seq("LowIncome", "gbcode", "ename", "WorkingAge", "TotalPop", "Employment")) } } + + // https://github.com/locationtech/geotrellis/issues/3445 + it("should read UTF-8 MultiPolygons feature attributes") { + val path = "shapefile/data/shapefiles/demographics-utf8/demographics.shp" + val features = ShapeFileReader.readMultiPolygonFeatures(path, Charset.forName("UTF-8")) + features.size should be (160) + + features.take(4).map(_.data("ename").asInstanceOf[String]) shouldBe Seq("南关街道", "七里烟香", "谢庄镇", "Cheng Guan Zhen") + + val featuresInvalid = ShapeFileReader.readMultiPolygonFeatures(path) + val enames = featuresInvalid.take(4).map(_.data("ename").asInstanceOf[String]) + enames should not be Seq("南关街道", "七里烟香", "谢庄镇", "Cheng Guan Zhen") + } } }