From 76ec482cd99d5ada837fa24461a64b22c330c14e Mon Sep 17 00:00:00 2001 From: Rui Mo Date: Wed, 15 Dec 2021 19:25:54 +0800 Subject: [PATCH 1/2] remove arrow datasource --- arrow-data-source/.travis.yml | 45 -- arrow-data-source/README.md | 231 --------- arrow-data-source/common/pom.xml | 111 ---- .../arrow/SparkManagedAllocationListener.java | 43 -- arrow-data-source/parquet/pom.xml | 27 - .../v2/parquet/ServiceLoaderUtil.scala | 32 -- .../parquet/ParquetFileFormat.scala | 479 ------------------ .../parquet/ParquetFileFormatIndicator.scala | 22 - .../datasources/parquet/ParquetSQLConf.scala | 40 -- .../v2/parquet/ParquetFileFormatTest.scala | 99 ---- arrow-data-source/pom.xml | 244 --------- .../resource/arrowdatasource_validation.png | Bin 47609 -> 0 bytes arrow-data-source/script/build_arrow.sh | 106 ---- arrow-data-source/standard/pom.xml | 105 ---- ...pache.spark.sql.sources.DataSourceRegister | 1 - .../oap/spark/sql/ArrowWriteExtension.scala | 163 ------ .../intel/oap/spark/sql/ArrowWriteQueue.scala | 134 ----- .../spark/sql/DataFrameReaderImplicits.scala | 49 -- .../spark/sql/DataFrameWriterImplicits.scala | 34 -- .../datasources/arrow/ArrowFileFormat.scala | 171 ------- .../v2/arrow/ArrowDataSourceV2.scala | 42 -- .../datasources/v2/arrow/ArrowFilters.scala | 206 -------- .../datasources/v2/arrow/ArrowOptions.scala | 49 -- .../arrow/ArrowPartitionReaderFactory.scala | 129 ----- .../datasources/v2/arrow/ArrowSQLConf.scala | 35 -- .../datasources/v2/arrow/ArrowScan.scala | 66 --- .../v2/arrow/ArrowScanBuilder.scala | 57 --- .../datasources/v2/arrow/ArrowTable.scala | 52 -- .../datasources/v2/arrow/ArrowUtils.scala | 142 ------ .../standard/src/test/resources/cars.csv | 7 - .../src/test/resources/example-tab.csv | 35 -- .../standard/src/test/resources/example.csv | 35 -- .../standard/src/test/resources/people.csv | 3 - .../arrow/ArrowDataSourceTPCHBasedTest.scala | 286 ----------- .../arrow/ArrowDataSourceTest.scala | 373 -------------- native-sql-engine/core/pom.xml | 32 +- .../v2/arrow/NativeSQLMemoryConsumer.java | 0 .../v2/arrow/NativeSQLMemoryMetrics.java | 0 .../arrow/SparkManagedAllocationListener.java | 81 +++ .../SparkManagedReservationListener.java | 0 .../datasources/v2/arrow/Spiller.java | 0 .../vectorized/ArrowWritableColumnVector.java | 0 .../execution/ArrowColumnarToRowExec.scala | 1 - .../BasicPhysicalOperatorTransformer.scala | 1 - .../execution/BatchScanExecTransformer.scala | 24 +- .../BroadcastHashJoinExecTransformer.scala | 1 - .../oap/execution/CoalesceBatchesExec.scala | 12 +- .../oap/execution/ColumnarDataSourceRDD.scala | 137 ----- .../oap/execution/ExpandExecTransformer.scala | 1 - .../HashAggregateExecTransformer.scala | 1 - .../execution/RowToArrowColumnarExec.scala | 2 +- .../ShuffledHashJoinExecTransformer.scala | 1 - .../oap/execution/SortExecTransformer.scala | 1 - .../SortMergeJoinExecTransformer.scala | 1 - .../oap/execution/WholestageColumnarRDD.scala | 1 - .../oap/execution/WindowExecTransformer.scala | 2 - .../intel/oap/expression/CodeGeneration.scala | 1 - .../intel/oap/expression/ConverterUtils.scala | 3 +- .../expression/UnaryOperatorTransformer.scala | 1 - .../oap/extension/ColumnarOverrides.scala | 1 - .../v2/arrow/SparkMemoryUtils.scala | 8 +- .../v2/arrow/SparkSchemaUtils.scala | 0 .../v2/arrow/SparkVectorUtils.scala | 0 .../ArrowEvalPythonExecTransformer.scala | 1 - pom.xml | 1 - 65 files changed, 118 insertions(+), 3850 deletions(-) delete mode 100644 arrow-data-source/.travis.yml delete mode 100644 arrow-data-source/README.md delete mode 
100644 arrow-data-source/common/pom.xml delete mode 100644 arrow-data-source/common/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedAllocationListener.java delete mode 100644 arrow-data-source/parquet/pom.xml delete mode 100644 arrow-data-source/parquet/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/parquet/ServiceLoaderUtil.scala delete mode 100644 arrow-data-source/parquet/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala delete mode 100644 arrow-data-source/parquet/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormatIndicator.scala delete mode 100644 arrow-data-source/parquet/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSQLConf.scala delete mode 100644 arrow-data-source/parquet/src/test/scala/com/intel/oap/spark/sql/execution/datasources/v2/parquet/ParquetFileFormatTest.scala delete mode 100644 arrow-data-source/pom.xml delete mode 100644 arrow-data-source/resource/arrowdatasource_validation.png delete mode 100755 arrow-data-source/script/build_arrow.sh delete mode 100644 arrow-data-source/standard/pom.xml delete mode 100644 arrow-data-source/standard/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/ArrowWriteExtension.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/ArrowWriteQueue.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/DataFrameReaderImplicits.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/DataFrameWriterImplicits.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowFileFormat.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowDataSourceV2.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowFilters.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowOptions.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowPartitionReaderFactory.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowSQLConf.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowScan.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowScanBuilder.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowTable.scala delete mode 100644 arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowUtils.scala delete mode 100644 arrow-data-source/standard/src/test/resources/cars.csv delete mode 100644 arrow-data-source/standard/src/test/resources/example-tab.csv delete mode 100644 arrow-data-source/standard/src/test/resources/example.csv delete mode 100644 arrow-data-source/standard/src/test/resources/people.csv delete mode 100644 
arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTPCHBasedTest.scala delete mode 100644 arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTest.scala rename {arrow-data-source/common => native-sql-engine/core}/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/NativeSQLMemoryConsumer.java (100%) rename {arrow-data-source/common => native-sql-engine/core}/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/NativeSQLMemoryMetrics.java (100%) create mode 100644 native-sql-engine/core/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedAllocationListener.java rename {arrow-data-source/common => native-sql-engine/core}/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedReservationListener.java (100%) rename {arrow-data-source/common => native-sql-engine/core}/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/Spiller.java (100%) rename {arrow-data-source/common => native-sql-engine/core}/src/main/java/com/intel/oap/vectorized/ArrowWritableColumnVector.java (100%) delete mode 100644 native-sql-engine/core/src/main/scala/com/intel/oap/execution/ColumnarDataSourceRDD.scala rename {arrow-data-source/common/src/main/scala/com/intel/oap/sql => native-sql-engine/core/src/main/scala/com/intel/oap}/execution/RowToArrowColumnarExec.scala (99%) rename {arrow-data-source/common => native-sql-engine/core}/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkMemoryUtils.scala (98%) rename {arrow-data-source/common => native-sql-engine/core}/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkSchemaUtils.scala (100%) rename {arrow-data-source/common => native-sql-engine/core}/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkVectorUtils.scala (100%) diff --git a/arrow-data-source/.travis.yml b/arrow-data-source/.travis.yml deleted file mode 100644 index 5c938a101a06..000000000000 --- a/arrow-data-source/.travis.yml +++ /dev/null @@ -1,45 +0,0 @@ -sudo: required -dist: bionic -language: java -jdk: openjdk8 -jobs: - include: - - - name: oap-native-sql - dist: bionic - jdk: - - openjdk8 - before_install: - - echo ${TRAVIS_COMMIT_MESSAGE} - #- if [[ ${TRAVIS_COMMIT_MESSAGE} != \[oap-native-sql\]* ]]; then travis_terminate 0 ; fi ; - - sudo apt-get install cmake - - sudo apt-get install libboost-all-dev - - export | grep JAVA_HOME - install: - - # Download spark 3.0 - - "[ -f spark ] || mkdir spark && cd spark && wget http://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz && cd .." - - "tar -xf ./spark/spark-3.0.0-bin-hadoop2.7.tgz" - - "export SPARK_HOME=`pwd`/spark-3.0.0-bin-hadoop2.7" - before_script: - - cd /tmp - - git clone https://github.com/intel-bigdata/arrow.git - - cd arrow && git checkout oap-master && cd cpp - - sed -i "s/\${Python3_EXECUTABLE}/\/opt\/pyenv\/shims\/python3/g" CMakeLists.txt - - mkdir build && cd build - - cmake .. 
-DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON && make - - sudo make install - - cd ../../java - - mvn clean install -q -P arrow-jni -am -Darrow.cpp.build.dir=/tmp/arrow/cpp/build/release/ -DskipTests -Dcheckstyle.skip - script: - - cd ${TRAVIS_BUILD_DIR} - - git submodule init - - cd dep/arrow-data-source - - mvn clean -q install -DskipTests - - cd ${TRAVIS_BUILD_DIR}/core - - mvn clean -q package -DskipTests #skip core tests - - # run native sql unit tests - - count=0; while [ $count -le 3 ]; do echo "Elapsed 3 minutes"; sleep 180; let count++; done & # print log each 3 minutes for 3 times to avoid no-log issue - - mvn test -DmembersOnlySuites=org.apache.spark.sql.travis -DfailIfNoTests=false -Dexec.skip=true &> log-file.log # skip cpp build - - echo '#!/bin/bash' > grep.sh - - echo "module_tested=0; module_should_test=1; tests_total=0; while read -r line; do num=\$(echo \"\$line\" | grep -o -E '[0-9]+'); tests_total=\$((tests_total+num)); done <<<\"\$(grep \"Total number of tests run:\" log-file.log)\"; succeed_total=0; while read -r line; do [[ \$line =~ [^0-9]*([0-9]+)\, ]]; num=\${BASH_REMATCH[1]}; succeed_total=\$((succeed_total+num)); let module_tested++; done <<<\"\$(grep \"succeeded\" log-file.log)\"; if test \$tests_total -eq \$succeed_total -a \$module_tested -eq \$module_should_test; then echo \"All unit tests succeed\"; else echo \"Unit tests failed\"; exit 1; fi" >> grep.sh - - bash grep.sh diff --git a/arrow-data-source/README.md b/arrow-data-source/README.md deleted file mode 100644 index 4e71ee5149f1..000000000000 --- a/arrow-data-source/README.md +++ /dev/null @@ -1,231 +0,0 @@ -# Arrow Data Source - -A Spark DataSource implementation for reading files into Arrow compatible columnar vectors. - -## Note - -The development of this library is still in progress. As a result some of the functionality may not be constantly stable for being used in production environments that have not been fully considered due to the limited testing capabilities so far. - -## Build - -### Prerequisite - -There are some requirements before you build the project. -Please make sure you have already installed the software in your system. - -1. GCC 7.0 or higher version -2. java8 OpenJDK -> yum install java-1.8.0-openjdk -3. cmake 3.16 or higher version -4. maven 3.6 or higher version -5. Hadoop 2.7.5 or higher version -6. Spark 3.1.1 or higher version -7. Intel Optimized Arrow 4.0.0 - -### Building by Conda - -If you already have a working Hadoop Spark Cluster, we provide a Conda package which will automatically install dependencies needed by OAP, you can refer to [OAP-Installation-Guide](../docs/OAP-Installation-Guide.md) for more information. Once finished [OAP-Installation-Guide](../docs/OAP-Installation-Guide.md), you can find built `spark-arrow-datasource-standard--jar-with-dependencies.jar` under `$HOME/miniconda2/envs/oapenv/oap_jars`. -Then you can just skip steps below and jump to [Get Started](#get-started). - -### cmake installation - -If you are facing some trouble when installing cmake, please follow below steps to install cmake. 
- -``` -// installing cmake 3.16.1 -sudo yum install cmake3 - -// If you have an existing cmake, you can use below command to set it as an option within alternatives command -sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake 10 --slave /usr/local/bin/ctest ctest /usr/bin/ctest --slave /usr/local/bin/cpack cpack /usr/bin/cpack --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake --family cmake - -// Set cmake3 as an option within alternatives command -sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake3 20 --slave /usr/local/bin/ctest ctest /usr/bin/ctest3 --slave /usr/local/bin/cpack cpack /usr/bin/cpack3 --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake3 --family cmake - -// Use alternatives to choose cmake version -sudo alternatives --config cmake -``` - -### maven installation - -If you are facing some trouble when installing maven, please follow below steps to install maven - -``` -// installing maven 3.6.3 -Go to https://maven.apache.org/download.cgi and download the specific version of maven - -// Below command use maven 3.6.3 as an example -wget https://ftp.wayne.edu/apache/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz -tar xzf apache-maven-3.6.3-bin.tar.gz -mkdir /usr/local/maven -mv apache-maven-3.6.3/ /usr/local/maven/ - -// Set maven 3.6.3 as an option within alternatives command -sudo alternatives --install /usr/bin/mvn mvn /usr/local/maven/apache-maven-3.6.3/bin/mvn 1 - -// Use alternatives to choose mvn version -sudo alternatives --config mvn -``` - -### Hadoop Native Library(Default) - -Please make sure you have set up Hadoop directory properly with Hadoop Native Libraries -By default, Apache Arrow would scan `$HADOOP_HOME` and find the native Hadoop library `libhdfs.so`(under `$HADOOP_HOME/lib/native` directory) to be used for Hadoop client. - -You can also use `ARROW_LIBHDFS_DIR` to configure the location of `libhdfs.so` if it is installed in other directory than `$HADOOP_HOME/lib/native` - -If your SPARK and HADOOP are separated in different nodes, please find `libhdfs.so` in your Hadoop cluster and copy it to SPARK cluster, then use one of the above methods to set it properly. - -For more information, please check -Arrow HDFS interface [documentation](https://github.com/apache/arrow/blob/master/cpp/apidoc/HDFS.md) -Hadoop Native Library, please read the official Hadoop website [documentation](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/NativeLibraries.html) - -### Use libhdfs3 library for better performance(Optional) - -For better performance ArrowDataSource reads HDFS files using the third-party library `libhdfs3`. The library must be pre-installed on machines Spark Executor nodes are running on. - -To install the library, use of [Conda](https://docs.conda.io/en/latest/) is recommended. - -``` -// installing libhdfs3 -conda install -c conda-forge libhdfs3 - -// check the installed library file -ll ~/miniconda/envs/$(YOUR_ENV_NAME)/lib/libhdfs3.so -``` - -To set up libhdfs3, there are two different ways: -Option1: Overwrite the soft link for libhdfs.so -To install libhdfs3.so, you have to create a soft link for libhdfs.so in your Hadoop directory(`$HADOOP_HOME/lib/native` by default). 
-
-```
-ln -f -s libhdfs3.so libhdfs.so
-```
-
-Option2:
-Add an environment variable to the system
-```
-export ARROW_LIBHDFS3_DIR="PATH_TO_LIBHDFS3_DIR/"
-```
-
-Add the following Spark configuration options before running the DataSource so that the library can be recognized:
-* `spark.executorEnv.ARROW_LIBHDFS3_DIR = "PATH_TO_LIBHDFS3_DIR/"`
-* `spark.executorEnv.LD_LIBRARY_PATH = "PATH_TO_LIBHDFS3_DEPENDENCIES_DIR/"`
-
-Please note: if you choose to use libhdfs3.so, there are some other dependency libraries you have to install, such as libprotobuf or libcrypto.
-
-### Build and install Intel® Optimized Arrow with Datasets Java API
-You have to use a customized Arrow to support our Datasets Java API.
-
-```
-// build arrow-cpp
-git clone -b arrow-4.0.0-oap https://github.com/oap-project/arrow.git
-cd arrow/cpp
-mkdir build
-cd build
-cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON ..
-make
-
-// build and install arrow jvm library
-cd ../../java
-mvn clean install -P arrow-jni -am -Darrow.cpp.build.dir=$PATH_TO_ARROW_SOURCE_CODE/arrow/cpp/build/release
-```
-
-### Build Arrow Data Source Library
-
-```
-// Download Arrow Data Source Code
-git clone -b https://github.com/oap-project/arrow-data-source.git
-
-// Go to the directory
-cd arrow-data-source
-
-// build
-mvn clean -DskipTests package
-
-// check built jar library
-readlink -f standard/target/spark-arrow-datasource-standard--jar-with-dependencies.jar
-```
-
-### Download Spark 3.1.1
-
-Currently ArrowDataSource works on the Spark 3.1.1 version.
-
-```
-wget http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
-tar -xf ./spark-3.1.1-bin-hadoop2.7.tgz
-export SPARK_HOME=`pwd`/spark-3.1.1-bin-hadoop2.7
-```
-
-If you are new to Apache Spark, please go through [Spark's official deploying guide](https://spark.apache.org/docs/latest/cluster-overview.html) before getting started with ArrowDataSource.
-
-## Get started
-### Add extra class paths to Spark
-
-To enable ArrowDataSource, the previously built jar `spark-arrow-datasource-standard--jar-with-dependencies.jar` should be added to the Spark configuration. Typically the options are:
-
-* `spark.driver.extraClassPath` : Set to load the jar file on the driver.
-* `spark.executor.extraClassPath` : Set to load the jar file on executors.
-* `jars` : Set to copy the jar file to the executors when using yarn cluster mode.
-* `spark.executorEnv.ARROW_LIBHDFS3_DIR` : Optional if you are using a custom libhdfs3.so.
-* `spark.executorEnv.LD_LIBRARY_PATH` : Optional if you are using a custom libhdfs3.so.
-
-For Spark Standalone Mode, please set the above values as relative paths to the jar file.
-For Spark Yarn Cluster Mode, please set the above values as absolute paths to the jar file.
-
-Example to run Spark Shell with the ArrowDataSource jar file:
-```
-${SPARK_HOME}/bin/spark-shell \
-  --verbose \
-  --master yarn \
-  --driver-memory 10G \
-  --conf spark.driver.extraClassPath=$PATH_TO_DATASOURCE_DIR/spark-arrow-datasource-standard--jar-with-dependencies.jar \
-  --conf spark.executor.extraClassPath=$PATH_TO_DATASOURCE_DIR/spark-arrow-datasource-standard--jar-with-dependencies.jar \
-  --conf spark.driver.cores=1 \
-  --conf spark.executor.instances=12 \
-  --conf spark.executor.cores=6 \
-  --conf spark.executor.memory=20G \
-  --conf spark.memory.offHeap.size=80G \
-  --conf spark.task.cpus=1 \
-  --conf spark.locality.wait=0s \
-  --conf spark.sql.shuffle.partitions=72 \
-  --conf spark.executorEnv.ARROW_LIBHDFS3_DIR="$PATH_TO_LIBHDFS3_DIR/" \
-  --conf spark.executorEnv.LD_LIBRARY_PATH="$PATH_TO_LIBHDFS3_DEPENDENCIES_DIR"
-```
-
-For more information about these options, please read the official Spark [documentation](https://spark.apache.org/docs/latest/configuration.html#runtime-environment).
-
-### Run a query with ArrowDataSource (Scala)
-
-```scala
-val path = "${PATH_TO_YOUR_PARQUET_FILE}"
-val df = spark.read
-  .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet")
-  .option(ArrowOptions.KEY_FILESYSTEM, "hdfs")
-  .format("arrow")
-  .load(path)
-df.createOrReplaceTempView("my_temp_view")
-spark.sql("SELECT * FROM my_temp_view LIMIT 10").show(10)
-```
-### To validate if ArrowDataSource works properly
-
-To validate that ArrowDataSource works, check the DAG of the above example query to see whether ArrowScan has been used.
-
-![Image of ArrowDataSource Validation](../docs/image/arrowdatasource_validation.png)
-
-
-## Work together with ParquetDataSource (experimental)
-
-We provide a customized replacement for Spark's built-in ParquetFileFormat. This way, users don't have
-to change existing Parquet-based SQL/code and will be able to read Arrow data from Parquet directly.
-More importantly, the feature can be extremely helpful in making ArrowDataSource work correctly
-with some 3rd-party storage tools (e.g. [Delta Lake](https://github.com/delta-io/delta)) that are built on top of ParquetDataSource.
-
-To replace the built-in ParquetDataSource, the only thing that has to be done is to place the compiled jar `spark-arrow-datasource-parquet-.jar` into
-Spark's library folder.
-
-If you'd like to verify that ParquetDataSource is successfully overwritten by the jar, run the following code
-before executing a SQL job:
-```
-ServiceLoaderUtil.ensureParquetFileFormatOverwritten();
-```
-
-Note that the whole feature is currently **experimental** and only DataSource v1 is supported. V2 support is being planned.
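For reference, a minimal end-to-end sketch of that verification, assuming a spark-shell session with the datasource jars on the classpath. It only reuses names that appear elsewhere in this patch (`ServiceLoaderUtil`, the `spark.sql.arrow.overwrite.parquet.read` flag from `ParquetSQLConf`, and the `Parquet-Overwritten-By-Arrow` plan indicator); the Parquet path is a placeholder.

```scala
import com.intel.oap.spark.sql.execution.datasources.v2.parquet.ServiceLoaderUtil

// Fail fast if the JVM picked up Spark's built-in ParquetFileFormat instead of the
// Arrow-overwritten one shipped in the spark-arrow-datasource-parquet jar.
ServiceLoaderUtil.ensureParquetFileFormatOverwritten()

// With the overwrite in place, a plain Parquet read should report the overwritten
// format in its physical plan.
val df = spark.read.parquet("/path/to/your/table.parquet") // placeholder path
assert(df.queryExecution.executedPlan.toString.contains("Format: Parquet-Overwritten-By-Arrow"))

// The Arrow-backed read path can also be disabled per session while keeping the
// overwritten writer (see ParquetSQLConf later in this patch).
spark.conf.set("spark.sql.arrow.overwrite.parquet.read", "false")
```

The same plan-string check is what the removed `ParquetFileFormatTest` relies on, so it stays in sync with the indicator defined in `ParquetFileFormatIndicator`.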
diff --git a/arrow-data-source/common/pom.xml b/arrow-data-source/common/pom.xml deleted file mode 100644 index 6372f21d1839..000000000000 --- a/arrow-data-source/common/pom.xml +++ /dev/null @@ -1,111 +0,0 @@ - - - - spark-arrow-datasource - com.intel.oap - 1.2.0-snapshot - ../pom.xml - - - 4.0.0 - spark-arrow-datasource-common - - - - org.apache.arrow - ${arrow-memory.artifact} - ${arrow.version} - runtime - - - org.apache.arrow - arrow-dataset - ${arrow.version} - - - io.netty - netty-common - - - io.netty - netty-buffer - - - com.fasterxml.jackson.core - jackson-core - - - com.fasterxml.jackson.core - jackson-annotations - - - compile - - - - ${project.basedir}/src/main/scala - ${project.basedir}/src/test/scala - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.8.0 - - 1.8 - 1.8 - - - - compile - - compile - - - - - - org.codehaus.mojo - build-helper-maven-plugin - - - add-src-1 - generate-sources - - add-source - - - - ${project.basedir}/src/main/java - - - - - - - - - - - org.scala-tools - maven-scala-plugin - - ${scala.version} - - - - - diff --git a/arrow-data-source/common/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedAllocationListener.java b/arrow-data-source/common/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedAllocationListener.java deleted file mode 100644 index abdabcc26dc3..000000000000 --- a/arrow-data-source/common/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedAllocationListener.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.intel.oap.spark.sql.execution.datasources.v2.arrow; - -import org.apache.arrow.memory.AllocationListener; - -public class SparkManagedAllocationListener implements AllocationListener { - - private final NativeSQLMemoryConsumer consumer; - private final NativeSQLMemoryMetrics metrics; - - public SparkManagedAllocationListener(NativeSQLMemoryConsumer consumer, NativeSQLMemoryMetrics metrics) { - this.consumer = consumer; - this.metrics = metrics; - } - - @Override - public void onPreAllocation(long size) { - consumer.acquire(size); - metrics.inc(size); - } - - @Override - public void onRelease(long size) { - consumer.free(size); - metrics.inc(-size); - } -} diff --git a/arrow-data-source/parquet/pom.xml b/arrow-data-source/parquet/pom.xml deleted file mode 100644 index 7e618fc887e5..000000000000 --- a/arrow-data-source/parquet/pom.xml +++ /dev/null @@ -1,27 +0,0 @@ - - - - spark-arrow-datasource - com.intel.oap - 1.2.0-snapshot - - 4.0.0 - - spark-arrow-datasource-parquet - - - ${project.basedir}/src/main/scala - ${project.basedir}/src/test/scala - - - - - com.intel.oap - spark-arrow-datasource-standard - ${project.version} - - - - diff --git a/arrow-data-source/parquet/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/parquet/ServiceLoaderUtil.scala b/arrow-data-source/parquet/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/parquet/ServiceLoaderUtil.scala deleted file mode 100644 index 001b1c26cf61..000000000000 --- a/arrow-data-source/parquet/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/parquet/ServiceLoaderUtil.scala +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.intel.oap.spark.sql.execution.datasources.v2.parquet - -import java.util.Objects - -import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetFileFormatIndicator} - -object ServiceLoaderUtil { - def ensureParquetFileFormatOverwritten(): Unit = { - val fmt = new ParquetFileFormat() - if (!Objects.equals(fmt.toString(), ParquetFileFormatIndicator.OVERWRITTEN_INDICATOR)) { - throw new ClassNotFoundException("ParquetFileFormat is not overwritten by Arrow. 
Consider " + - "reordering jar dependencies to let the overwritten version to be recognized by JVM") - } - } -} diff --git a/arrow-data-source/parquet/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/arrow-data-source/parquet/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala deleted file mode 100644 index 836e9f2c92a8..000000000000 --- a/arrow-data-source/parquet/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ /dev/null @@ -1,479 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.datasources.parquet - -import java.io.IOException -import java.net.URI - -import scala.collection.JavaConverters._ -import scala.util.{Failure, Try} - -import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.hadoop.mapreduce.{Job, JobID, OutputCommitter, TaskAttemptContext, TaskAttemptID, TaskID, TaskType} -import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl -import org.apache.parquet.filter2.compat.FilterCompat -import org.apache.parquet.filter2.predicate.FilterApi -import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS -import org.apache.parquet.hadoop.{Footer, ParquetFileReader, ParquetInputFormat, ParquetOutputCommitter, ParquetOutputFormat, ParquetRecordReader} -import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel -import org.apache.parquet.hadoop.codec.CodecConfig -import org.apache.parquet.hadoop.util.ContextUtil -import org.apache.parquet.schema.MessageType - -import org.apache.spark.TaskContext -import org.apache.spark.internal.Logging -import org.apache.spark.sql.{Row, SparkSession} -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.JoinedRow -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection -import org.apache.spark.sql.catalyst.parser.LegacyTypeStringParser -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.datasources.{DataSourceUtils, OutputWriter, OutputWriterFactory, PartitionedFile, RecordReaderIterator, SchemaMergeUtils} -import org.apache.spark.sql.execution.datasources.parquet.ParquetSQLConf._ -import org.apache.spark.sql.execution.vectorized.{OffHeapColumnVector, OnHeapColumnVector} -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.{DataSourceRegister, Filter} -import org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, MapType, StructType, UserDefinedType} -import org.apache.spark.util.{SerializableConfiguration, 
ThreadUtils} - -/** - * This is expected to overwrite built-in ParquetFileFormat. Read is redirected to ArrowFileFormat's - * read. - */ -class ParquetFileFormat - extends ArrowFileFormat - with DataSourceRegister - with Logging - with Serializable { - - override def shortName(): String = "parquet" - - override def toString: String = ParquetFileFormatIndicator.OVERWRITTEN_INDICATOR - - override def hashCode(): Int = getClass.hashCode() - - override def equals(other: Any): Boolean = other.isInstanceOf[ParquetFileFormat] - - /** - * We copy following codes from Spark's built-in ParquetFileFormat. It's not suggested to - * change any of the logic to make sure we are on the same boat with Spark when it comes - * to Parquet write. - */ - override def prepareWrite(sparkSession: SparkSession, - job: Job, - options: Map[String, String], - dataSchema: StructType): OutputWriterFactory = { - val parquetOptions = new ParquetOptions(options, sparkSession.sessionState.conf) - - val conf = ContextUtil.getConfiguration(job) - - val committerClass = - conf.getClass( - SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, - classOf[ParquetOutputCommitter], - classOf[OutputCommitter]) - - if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) { - logInfo("Using default output committer for Parquet: " + - classOf[ParquetOutputCommitter].getCanonicalName) - } else { - logInfo("Using user defined output committer for Parquet: " + committerClass.getCanonicalName) - } - - conf.setClass( - SQLConf.OUTPUT_COMMITTER_CLASS.key, - committerClass, - classOf[OutputCommitter]) - - // We're not really using `ParquetOutputFormat[Row]` for writing data here, because we override - // it in `ParquetOutputWriter` to support appending and dynamic partitioning. The reason why - // we set it here is to setup the output committer class to `ParquetOutputCommitter`, which is - // bundled with `ParquetOutputFormat[Row]`. - job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]]) - - ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport]) - - // This metadata is useful for keeping UDTs like Vector/Matrix. - ParquetWriteSupport.setSchema(dataSchema, conf) - - // Sets flags for `ParquetWriteSupport`, which converts Catalyst schema to Parquet - // schema and writes actual rows to Parquet files. - conf.set( - SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key, - sparkSession.sessionState.conf.writeLegacyParquetFormat.toString) - - conf.set( - SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key, - sparkSession.sessionState.conf.parquetOutputTimestampType.toString) - - // Sets compression scheme - conf.set(ParquetOutputFormat.COMPRESSION, parquetOptions.compressionCodecClassName) - - // SPARK-15719: Disables writing Parquet summary files by default. - if (conf.get(ParquetOutputFormat.JOB_SUMMARY_LEVEL) == null - && conf.get(ParquetOutputFormat.ENABLE_JOB_SUMMARY) == null) { - conf.setEnum(ParquetOutputFormat.JOB_SUMMARY_LEVEL, JobSummaryLevel.NONE) - } - - if (ParquetOutputFormat.getJobSummaryLevel(conf) != JobSummaryLevel.NONE - && !classOf[ParquetOutputCommitter].isAssignableFrom(committerClass)) { - // output summary is requested, but the class is not a Parquet Committer - logWarning(s"Committer $committerClass is not a ParquetOutputCommitter and cannot" + - s" create job summaries. 
" + - s"Set Parquet option ${ParquetOutputFormat.JOB_SUMMARY_LEVEL} to NONE.") - } - - new OutputWriterFactory { - // This OutputWriterFactory instance is deserialized when writing Parquet files on the - // executor side without constructing or deserializing ParquetFileFormat. Therefore, we hold - // another reference to ParquetLogRedirector.INSTANCE here to ensure the latter class is - // initialized. - private val parquetLogRedirector = ParquetLogRedirector.INSTANCE - - override def newInstance( - path: String, - dataSchema: StructType, - context: TaskAttemptContext): OutputWriter = { - new ParquetOutputWriter(path, context) - } - - override def getFileExtension(context: TaskAttemptContext): String = { - CodecConfig.from(context).getCodec.getExtension + ".parquet" - } - } - } - - override def inferSchema(sparkSession: SparkSession, - parameters: Map[String, String], - files: Seq[FileStatus]): Option[StructType] = { - val overwrite = sparkSession.sqlContext.conf.overwriteParquetDataSourceRead - if (overwrite) { - return super.inferSchema(sparkSession, parameters, files) - } - ParquetUtils.inferSchema(sparkSession, parameters, files) - } - - /** - * Returns whether the reader will return the rows as batch or not. - */ - override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { - val conf = sparkSession.sessionState.conf - val overwrite = sparkSession.sqlContext.conf.overwriteParquetDataSourceRead - if (overwrite) { - return super.supportBatch(sparkSession, schema) - } - conf.parquetVectorizedReaderEnabled && conf.wholeStageEnabled && - schema.length <= conf.wholeStageMaxNumFields && - schema.forall(_.dataType.isInstanceOf[AtomicType]) - } - - override def vectorTypes(requiredSchema: StructType, - partitionSchema: StructType, - sqlConf: SQLConf): Option[Seq[String]] = { - val overwrite = sqlConf.overwriteParquetDataSourceRead - if (overwrite) { - return super.vectorTypes(requiredSchema, partitionSchema, sqlConf) - } - Option(Seq.fill(requiredSchema.fields.length + partitionSchema.fields.length)( - if (!sqlConf.offHeapColumnVectorEnabled) { - classOf[OnHeapColumnVector].getName - } else { - classOf[OffHeapColumnVector].getName - } - )) - } - - override def isSplitable(sparkSession: SparkSession, - options: Map[String, String], - path: Path): Boolean = { - val overwrite = sparkSession.sqlContext.conf.overwriteParquetDataSourceRead - if (overwrite) { - return super.isSplitable(sparkSession, options, path) - } - true - } - - override def buildReaderWithPartitionValues(sparkSession: SparkSession, - dataSchema: StructType, - partitionSchema: StructType, - requiredSchema: StructType, - filters: Seq[Filter], - options: Map[String, String], - hadoopConf: Configuration): - (PartitionedFile) => Iterator[InternalRow] = { - val overwrite = sparkSession.sqlContext.conf.overwriteParquetDataSourceRead - if (overwrite) { - return super.buildReaderWithPartitionValues(sparkSession, dataSchema, - partitionSchema, requiredSchema, filters, options, hadoopConf) - } - hadoopConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[ParquetReadSupport].getName) - hadoopConf.set( - ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, - requiredSchema.json) - hadoopConf.set( - ParquetWriteSupport.SPARK_ROW_SCHEMA, - requiredSchema.json) - hadoopConf.set( - SQLConf.SESSION_LOCAL_TIMEZONE.key, - sparkSession.sessionState.conf.sessionLocalTimeZone) - hadoopConf.setBoolean( - SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key, - sparkSession.sessionState.conf.nestedSchemaPruningEnabled) - 
hadoopConf.setBoolean( - SQLConf.CASE_SENSITIVE.key, - sparkSession.sessionState.conf.caseSensitiveAnalysis) - - ParquetWriteSupport.setSchema(requiredSchema, hadoopConf) - - // Sets flags for `ParquetToSparkSchemaConverter` - hadoopConf.setBoolean( - SQLConf.PARQUET_BINARY_AS_STRING.key, - sparkSession.sessionState.conf.isParquetBinaryAsString) - hadoopConf.setBoolean( - SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, - sparkSession.sessionState.conf.isParquetINT96AsTimestamp) - - val broadcastedHadoopConf = - sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) - - // TODO: if you move this into the closure it reverts to the default values. - // If true, enable using the custom RecordReader for parquet. This only works for - // a subset of the types (no complex types). - val resultSchema = StructType(partitionSchema.fields ++ requiredSchema.fields) - val sqlConf = sparkSession.sessionState.conf - val enableOffHeapColumnVector = sqlConf.offHeapColumnVectorEnabled - val enableVectorizedReader: Boolean = - sqlConf.parquetVectorizedReaderEnabled && - resultSchema.forall(_.dataType.isInstanceOf[AtomicType]) - val enableRecordFilter: Boolean = sqlConf.parquetRecordFilterEnabled - val timestampConversion: Boolean = sqlConf.isParquetINT96TimestampConversion - val capacity = sqlConf.parquetVectorizedReaderBatchSize - val enableParquetFilterPushDown: Boolean = sqlConf.parquetFilterPushDown - // Whole stage codegen (PhysicalRDD) is able to deal with batches directly - val returningBatch = supportBatch(sparkSession, resultSchema) - val pushDownDate = sqlConf.parquetFilterPushDownDate - val pushDownTimestamp = sqlConf.parquetFilterPushDownTimestamp - val pushDownDecimal = sqlConf.parquetFilterPushDownDecimal - val pushDownStringStartWith = sqlConf.parquetFilterPushDownStringStartWith - val pushDownInFilterThreshold = sqlConf.parquetFilterPushDownInFilterThreshold - val isCaseSensitive = sqlConf.caseSensitiveAnalysis - - (file: PartitionedFile) => { - assert(file.partitionValues.numFields == partitionSchema.size) - - val filePath = new Path(new URI(file.filePath)) - val split = - new org.apache.parquet.hadoop.ParquetInputSplit( - filePath, - file.start, - file.start + file.length, - file.length, - Array.empty, - null) - - val sharedConf = broadcastedHadoopConf.value.value - - lazy val footerFileMetaData = - ParquetFileReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData - // Try to push down filters when filter push-down is enabled. - val pushed = if (enableParquetFilterPushDown) { - val parquetSchema = footerFileMetaData.getSchema - val parquetFilters = new ParquetFilters(parquetSchema, pushDownDate, pushDownTimestamp, - pushDownDecimal, pushDownStringStartWith, pushDownInFilterThreshold, isCaseSensitive) - filters - // Collects all converted Parquet filter predicates. Notice that not all predicates can be - // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` - // is used here. - .flatMap(parquetFilters.createFilter(_)) - .reduceOption(FilterApi.and) - } else { - None - } - - // PARQUET_INT96_TIMESTAMP_CONVERSION says to apply timezone conversions to int96 timestamps' - // *only* if the file was created by something other than "parquet-mr", so check the actual - // writer here for this file. We have to do this per-file, as each file in the table may - // have different writers. - // Define isCreatedByParquetMr as function to avoid unnecessary parquet footer reads. 
- def isCreatedByParquetMr: Boolean = - footerFileMetaData.getCreatedBy().startsWith("parquet-mr") - - val convertTz = - if (timestampConversion && !isCreatedByParquetMr) { - Some(DateTimeUtils.getZoneId(sharedConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) - } else { - None - } - - val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode( - footerFileMetaData.getKeyValueMetaData.get, - SQLConf.get.getConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ)) - - val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) - val hadoopAttemptContext = - new TaskAttemptContextImpl(broadcastedHadoopConf.value.value, attemptId) - - // Try to push down filters when filter push-down is enabled. - // Notice: This push-down is RowGroups level, not individual records. - if (pushed.isDefined) { - ParquetInputFormat.setFilterPredicate(hadoopAttemptContext.getConfiguration, pushed.get) - } - val taskContext = Option(TaskContext.get()) - if (enableVectorizedReader) { - val vectorizedReader = new VectorizedParquetRecordReader( - convertTz.orNull, - datetimeRebaseMode.toString, - "", - enableOffHeapColumnVector && taskContext.isDefined, - capacity) - val iter = new RecordReaderIterator(vectorizedReader) - // SPARK-23457 Register a task completion listener before `initialization`. - taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) - vectorizedReader.initialize(split, hadoopAttemptContext) - logDebug(s"Appending $partitionSchema ${file.partitionValues}") - vectorizedReader.initBatch(partitionSchema, file.partitionValues) - if (returningBatch) { - vectorizedReader.enableReturningBatches() - } - - // UnsafeRowParquetRecordReader appends the columns internally to avoid another copy. - iter.asInstanceOf[Iterator[InternalRow]] - } else { - logDebug(s"Falling back to parquet-mr") - // ParquetRecordReader returns InternalRow - val readSupport = new ParquetReadSupport( - convertTz, enableVectorizedReader = false, datetimeRebaseMode, SQLConf.LegacyBehaviorPolicy.LEGACY) - val reader = if (pushed.isDefined && enableRecordFilter) { - val parquetFilter = FilterCompat.get(pushed.get, null) - new ParquetRecordReader[InternalRow](readSupport, parquetFilter) - } else { - new ParquetRecordReader[InternalRow](readSupport) - } - val iter = new RecordReaderIterator[InternalRow](reader) - // SPARK-23457 Register a task completion listener before `initialization`. - taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) - reader.initialize(split, hadoopAttemptContext) - - val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes - val unsafeProjection = GenerateUnsafeProjection.generate(fullSchema, fullSchema) - - if (partitionSchema.length == 0) { - // There is no partition columns - iter.map(unsafeProjection) - } else { - val joinedRow = new JoinedRow() - iter.map(d => unsafeProjection(joinedRow(d, file.partitionValues))) - } - } - } - } - - override def supportDataType(dataType: DataType): Boolean = dataType match { - case _: AtomicType => true - - case st: StructType => st.forall { f => supportDataType(f.dataType) } - - case ArrayType(elementType, _) => supportDataType(elementType) - - case MapType(keyType, valueType, _) => - supportDataType(keyType) && supportDataType(valueType) - - case udt: UserDefinedType[_] => supportDataType(udt.sqlType) - - case _ => false - } -} - -object ParquetFileFormat extends Logging { - /** - * Reads Spark SQL schema from a Parquet footer. 
If a valid serialized Spark SQL schema string - * can be found in the file metadata, returns the deserialized [[StructType]], otherwise, returns - * a [[StructType]] converted from the [[MessageType]] stored in this footer. - */ - def readSchemaFromFooter(footer: Footer, converter: ParquetToSparkSchemaConverter): StructType = { - val fileMetaData = footer.getParquetMetadata.getFileMetaData - fileMetaData - .getKeyValueMetaData - .asScala.toMap - .get(ParquetReadSupport.SPARK_METADATA_KEY) - .flatMap(deserializeSchemaString) - .getOrElse(converter.convert(fileMetaData.getSchema)) - } - - private def deserializeSchemaString(schemaString: String): Option[StructType] = { - // Tries to deserialize the schema string as JSON first, then falls back to the case class - // string parser (data generated by older versions of Spark SQL uses this format). - Try(DataType.fromJson(schemaString).asInstanceOf[StructType]).recover { - case _: Throwable => - logInfo( - "Serialized Spark schema in Parquet key-value metadata is not in JSON format, " + - "falling back to the deprecated DataType.fromCaseClassString parser.") - LegacyTypeStringParser.parseString(schemaString).asInstanceOf[StructType] - }.recoverWith { - case cause: Throwable => - logWarning( - "Failed to parse and ignored serialized Spark schema in " + - s"Parquet key-value metadata:\n\t$schemaString", cause) - Failure(cause) - }.toOption - } - - def mergeSchemasInParallel( - filesToTouch: Seq[FileStatus], - sparkSession: SparkSession): Option[StructType] = { - val assumeBinaryIsString = sparkSession.sessionState.conf.isParquetBinaryAsString - val assumeInt96IsTimestamp = sparkSession.sessionState.conf.isParquetINT96AsTimestamp - - val reader = (files: Seq[FileStatus], conf: Configuration, ignoreCorruptFiles: Boolean) => { - // Converter used to convert Parquet `MessageType` to Spark SQL `StructType` - val converter = new ParquetToSparkSchemaConverter( - assumeBinaryIsString = assumeBinaryIsString, - assumeInt96IsTimestamp = assumeInt96IsTimestamp) - - readParquetFootersInParallel(conf, files, ignoreCorruptFiles) - .map(ParquetFileFormat.readSchemaFromFooter(_, converter)) - } - - SchemaMergeUtils.mergeSchemasInParallel(sparkSession, null, filesToTouch, reader) - } - - private[parquet] def readParquetFootersInParallel( - conf: Configuration, - partFiles: Seq[FileStatus], - ignoreCorruptFiles: Boolean): Seq[Footer] = { - ThreadUtils.parmap(partFiles, "readingParquetFooters", 8) { currentFile => - try { - // Skips row group information since we only need the schema. - // ParquetFileReader.readFooter throws RuntimeException, instead of IOException, - // when it can't read the footer. 
- Some(new Footer(currentFile.getPath(), - ParquetFileReader.readFooter( - conf, currentFile, SKIP_ROW_GROUPS))) - } catch { case e: RuntimeException => - if (ignoreCorruptFiles) { - logWarning(s"Skipped the footer in the corrupted file: $currentFile", e) - None - } else { - throw new IOException(s"Could not read footer for file: $currentFile", e) - } - } - }.flatten - } -} diff --git a/arrow-data-source/parquet/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormatIndicator.scala b/arrow-data-source/parquet/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormatIndicator.scala deleted file mode 100644 index 29aa38069b46..000000000000 --- a/arrow-data-source/parquet/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormatIndicator.scala +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.datasources.parquet - -object ParquetFileFormatIndicator { - val OVERWRITTEN_INDICATOR = "Parquet-Overwritten-By-Arrow" -} diff --git a/arrow-data-source/parquet/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSQLConf.scala b/arrow-data-source/parquet/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSQLConf.scala deleted file mode 100644 index 9e3096189afa..000000000000 --- a/arrow-data-source/parquet/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSQLConf.scala +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.datasources.parquet - -import org.apache.spark.sql.internal.SQLConf - -object ParquetSQLConf { - // We default this option value to TRUE. This is because once the code is executed, the compiled - // arrow-datasource-parquet.jar file is supposed to be placed into Spark's lib folder. Which - // means it's user's intention to use the replaced ParquetDataSource. 
- val OVERWRITE_PARQUET_DATASOURCE_READ = - SQLConf.buildConf("spark.sql.arrow.overwrite.parquet.read") - .doc("Overwrite Parquet datasource v1 with reader of Arrow datasource.") - .booleanConf - .createWithDefault(true) - - implicit def fromSQLConf(c: SQLConf): ParquetSQLConf = { - new ParquetSQLConf(c) - } -} - -class ParquetSQLConf(c: SQLConf) { - def overwriteParquetDataSourceRead: Boolean = - c.getConf(ParquetSQLConf.OVERWRITE_PARQUET_DATASOURCE_READ) -} diff --git a/arrow-data-source/parquet/src/test/scala/com/intel/oap/spark/sql/execution/datasources/v2/parquet/ParquetFileFormatTest.scala b/arrow-data-source/parquet/src/test/scala/com/intel/oap/spark/sql/execution/datasources/v2/parquet/ParquetFileFormatTest.scala deleted file mode 100644 index 84e0fc410a05..000000000000 --- a/arrow-data-source/parquet/src/test/scala/com/intel/oap/spark/sql/execution/datasources/v2/parquet/ParquetFileFormatTest.scala +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.intel.oap.spark.sql.execution.datasources.v2.parquet - -import java.io.File - -import com.intel.oap.vectorized.ArrowWritableColumnVector - -import org.apache.spark.SparkConf -import org.apache.spark.sql.QueryTest -import org.apache.spark.sql.execution.FileSourceScanExec -import org.apache.spark.sql.execution.datasources.parquet.ParquetSQLConf -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils -import org.apache.spark.sql.test.SharedSparkSession - -class ParquetFileFormatTest extends QueryTest with SharedSparkSession { - - private val parquetFile1 = "parquet-1.parquet" - - override protected def sparkConf: SparkConf = { - val conf = super.sparkConf - conf.set("spark.memory.offHeap.size", String.valueOf(256 * 1024 * 1024)) - conf - } - - def closeAllocators(): Unit = { - SparkMemoryUtils.contextAllocator().close() - } - - test("overwrite write only") { - import testImplicits._ - withSQLConf(ParquetSQLConf.OVERWRITE_PARQUET_DATASOURCE_READ.key -> "false") { - ServiceLoaderUtil.ensureParquetFileFormatOverwritten() - spark.read - .json(Seq("{\"col\": -1}", "{\"col\": 0}", "{\"col\": 1}", "{\"col\": 2}", - "{\"col\": null}") - .toDS()) - .repartition(1) - .write - .mode("overwrite") - .parquet(ParquetFileFormatTest.locateResourcePath(parquetFile1)) - val path = ParquetFileFormatTest.locateResourcePath(parquetFile1) - val frame = spark.read.parquet(path) - val eplan = frame.queryExecution.executedPlan - assert(eplan.toString - .contains("Format: Parquet-Overwritten-By-Arrow")) - val scan = eplan.find(_.isInstanceOf[FileSourceScanExec]).get - val typeAssertions = scan.executeColumnar() - .flatMap(b => (0 until b.numCols()).map(b.column(_))) - .map(!_.isInstanceOf[ArrowWritableColumnVector]) - .collect() - 
assert(typeAssertions.forall(p => p)) - } - } - - test("overwrite read and write") { - import testImplicits._ - ServiceLoaderUtil.ensureParquetFileFormatOverwritten() - spark.read - .json(Seq("{\"col\": -1}", "{\"col\": 0}", "{\"col\": 1}", "{\"col\": 2}", "{\"col\": null}") - .toDS()) - .repartition(1) - .write - .mode("overwrite") - .parquet(ParquetFileFormatTest.locateResourcePath(parquetFile1)) - val path = ParquetFileFormatTest.locateResourcePath(parquetFile1) - val frame = spark.read.parquet(path) - val eplan = frame.queryExecution.executedPlan - assert(eplan.toString - .contains("Format: Parquet-Overwritten-By-Arrow")) - val scan = eplan.find(_.isInstanceOf[FileSourceScanExec]).get - val typeAssertions = scan.executeColumnar() - .flatMap(b => (0 until b.numCols()).map(b.column(_))) - .map(_.isInstanceOf[ArrowWritableColumnVector]) - .collect() - assert(typeAssertions.forall(p => p)) - } -} - -object ParquetFileFormatTest { - private def locateResourcePath(resource: String): String = { - classOf[ParquetFileFormatTest].getClassLoader.getResource("") - .getPath.concat(File.separator).concat(resource) - } -} diff --git a/arrow-data-source/pom.xml b/arrow-data-source/pom.xml deleted file mode 100644 index 3f72871f2dec..000000000000 --- a/arrow-data-source/pom.xml +++ /dev/null @@ -1,244 +0,0 @@ - - - com.intel.oap - native-sql-engine-parent - 1.2.0-snapshot - - - 4.0.0 - com.intel.oap - spark-arrow-datasource - OAP Project Spark Arrow Datasource - pom - 1.2.0-snapshot - 2008 - - common - standard - parquet - - - ${arrow.script.dir} - ${cpp_tests} - ${build_arrow} - ${static_arrow} - ${arrow_root} - - - - - scala-tools.org - Scala-Tools Maven2 Repository - http://scala-tools.org/repo-releases - - - - - - scala-tools.org - Scala-Tools Maven2 Repository - http://scala-tools.org/repo-releases - - - - - - javax.servlet - javax.servlet-api - 3.1.0 - - - org.apache.hadoop - hadoop-client - ${hadoop.version} - provided - - - org.apache.hadoop - hadoop-aws - ${hadoop.version} - - - com.fasterxml.jackson.core - jackson-core - - - com.fasterxml.jackson.core - jackson-annotations - - - com.fasterxml.jackson.core - jackson-databind - - - javax.servlet - servlet-api - - - com.sun.jersey - jersey-core - - - com.sun.jersey - jersey-json - - - com.sun.jersey - jersey-server - - - commons-httpclient - commons-httpcore - - - org.slf4j - slf4j-log4j12 - - - log4j - log4j - - - - - org.apache.httpcomponents - httpcore - 4.2 - - - org.scala-lang - scala-library - ${scala.version} - provided - - - junit - junit - 4.4 - test - - - org.apache.spark - spark-sql_${scala.binary.version} - provided - - - org.apache.spark - spark-core_${scala.binary.version} - test-jar - test - - - org.apache.spark - spark-catalyst_${scala.binary.version} - test-jar - test - - - org.apache.spark - spark-sql_${scala.binary.version} - test-jar - test - - - org.scalatest - scalatest_${scala.binary.version} - test - - - - - - - exec-maven-plugin - org.codehaus.mojo - 1.6.0 - false - - - Build arrow - generate-resources - - exec - - - bash - - ${script.dir}/build_arrow.sh - --tests=${datasource.cpp_tests} - --build_arrow=${datasource.build_arrow} - --static_arrow=${datasource.static_arrow} - --arrow_root=${datasource.arrow_root} - - - - - - - maven-clean-plugin - - - - ${script.dir}/build - - - - - - org.scalatest - scalatest-maven-plugin - - - - test - - - - - - org.scala-tools - maven-scala-plugin - - - - compile - testCompile - - process-sources - - - - ${scala.version} - - -target:jvm-1.5 - - - - - org.scalastyle - 
scalastyle-maven-plugin - 1.0.0 - - true - false - true - false - false - ${project.basedir}/src/main/scala - ${project.basedir}/src/test/scala - ${user.dir}/scalastyle-config.xml - ${project.basedir}/target/scalastyle-output.xml - ${project.build.sourceEncoding} - ${project.reporting.outputEncoding} - - - - - check - - - - - - - diff --git a/arrow-data-source/resource/arrowdatasource_validation.png b/arrow-data-source/resource/arrowdatasource_validation.png deleted file mode 100644 index c7583511a44a8cb785581bdd8191362572e3606c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 47609 zcmY(q30RVA*FEkT%G2oSlzPhPl$n(Sl`{@BnUzyU&QobQD>>k-oa&UNrIk4gq^3BN zbD}_MPC1Z@I3Q9^D5#_e3JCn4zW2P}|NGs~<%K9-Jlx)U?Y-Ap``HZ}i+#I~?3R*} z+IQ{h<(pDcJJqG6e$M~xXW%D$7sLeM;ivGM78j+e2NjoqC%^h$u)ZKARg<=7`_2yF z`L6p{ox`Q1{)qVT_0zPEO0<+zbLzFr7j8v+EQtFd{iPaK|%Jie+|V;<{y{go5g>ZC=cKKbO`#n=HjnMZX7#(?bn|JF6+MibN^1G zU;p|0pZiza`G5WN^Mmt$9X;B%jUDNC(_#Ht`?~TaNeJgMD{}&?n|&xQcdD>Db*sj) z+q#c07Th>_X!m~~UXO=)0w3+;N8U;keHD6INhs8*lgY@4#eAC15&bG9wdnrNMoQ}T z+i=bawBZiqJN(9LL(2+31;G0;uGJL5UL3{KP} ztlQ7l`hOP9wP7$X!lS4z@6qQ1@f8`@_p|CXQFc6J?td1J!4wvPN^EW=2q%jbgninr zk|qvYG|$XTa{-|Qgd)}JaRL9cm>w1Ixpy5Wl<0~Mw_IK0E*l-0ae95WBW&>wYwMb?Ejaqze|Il}F{s2; zuFBibE{$W5EKn_yuOCztk4PRd8Hx8duresub4!#?9bh*tJhT4YEli+6*ldQ0ND^TI zV-*`CPEs#135^-4rCK}QOS#HwY@++Oj1EnPVTw$>6#m`s)zJq%*BI8|38M+b;!mvQ zQ@@*-PelFamjO&vh0$0UgJm40oxq=dR%WZ*yY;OABI+I@O6pH=Bvmo1;vx_xKUYM} zQ|C3&aZ3&C#Sh^vT;7W=s-yN)xiKZ)3o+6bY_LDW0A04~7qXiU(fgkT-(&{B?;sV7 zmbMn=debqAnDo5E^JUPG?TGEIP$iQk%ay`qR zqEerk4z4F5&TFu=gCr@9-4=Cwis|wYk)tJTEtSVh_`dRVEB^w{ePf}8`@?}sNmLJS4YGb3F5qM54T0~1oWmBQS``}u><)HX8vwjN_ zC}&EPB-wzZ4s3DD_#_lcv{7~nV#*sMtAfPcubv~Y39Dk}+GYj4wS}lZYvJYZ-I-Nl zD<$>v1{hvz8&Ru@VGDjSy`^L_fiZps{z>X@v*S=qYU+Nbayb>&Yt-BtP$x3|litM)O9U@8AO?t;|-;X3Z>tC8m8B^IU9npaj)t8*TAuX~ zI3sC4PG|IdK{{5nwM9mw1!F2ijr{n9m8>+8Xjek@$Pm%Y$x)ZvwRU~NH>#_nzWsku zK>rK|&b;gCfq|+c7qb1-o0kf9-c(O`g&a4=*R|7>R9^W1zt_M!8`{!C|AE{Pd1#Ys zNWGgOJeV!ef%zyRU&hWg3vI1MG2WdxssWazKY)=zdTdQSBbr7A@J8T$;_3G><|D66 zgU3KkWTlq(%onN$l}nWok|AIxwGlU=DA|x^4;yy1>i;p1l$+1!bzAOtE&m-L5x}d`XbkTz_%Rc4yHz~46o#kxquK+1<|~< zPfvyj8=N(QkYp7GW9*C%#(+mg0A*RDNEC2Xl!pe1)!!Z*hVTf|g6FFo9{&J~+a zq)`GWvKrF?{>0I#Y08O(r~E2MZ4bAW#kqP+D&adMRl=8;nnrCjzziDnJ@(U%{fjd{ zvl5ZuT|Q4wM<>4ln?rsk|GP}>=<*3?;TwWT*Fb8+lZqB%>(*L+U9)$pt@s*u&$|5L zTteHA8260X+StGVNrvb4jm{ifaHkUayegk&A3s=OQ6GGtYx2FhjbVvNW=_}%b$64X zK->|6>A!D26JQMA9S2oUUp=m2dbQ0O`N?Cv4EiPkQ}Um++8p~fpGBrnPMlaaYJ$=C z*B=PiHMRIMQb>{E#;<)@S>X)?j_nUIHf1z*>qs_-7;G$nxlG0?t657oHIdO73|(jH`dAaS+C5}S)Y+RuG~>-66cfHdK7A?%(Sutij=NrV`#1E! zXW{S2y?2d75O{Je`+s-F5y*b4XTh>zkB(ljqJIc2YG@c8Nc+dNDP+gORBH@Jz-4J^ zX$cCP$`ziv=u(uDmr6=b(5y5Anxh47Lu4JSiIZ?jB;76g`ejRnyuD=MEBCymHrT%J zF%THAhNgx_ddNZV;e;n@`d>cyik4zXijomxN@A*LrA9!ahXl9YmHS2NZ{J$b&puLuj}? 
zHX5=1>k-;$d{qwYR72>RSOiO`n@}c{*R9Nzut230%`>FfsP}T8GyC2`Hwv`TtA$o2 zJMGo6@1Lf==J?KTOLbvM<=XX)?@2<_&lxP!Dnx=+S@?4#;IW!H!7QST#tmWvOe+)d zor&qeE2;{!W}q>V7N*LkxGv5KMx*zopYWEUp>r&BEV&+pDgCx2QKP}Qypis~n6%$? z0Dy$(sptX#iEpR~Rhl=)K9FeGau}wNtCrcu>SjVsRZvsUr+_3WC+l>UR5qQd{CVTY zQ`2Wenz40~7ec7;?+iVggXe>?W}0Sv^@CND(+hXx(S_Y*M>1oX#Egst4V1kaO_1}j z!7jml6<(l@TFlZyV<(f6%$4_vdTh@M;f^P2{w%O>p}>T@Febls%-^$KhNz^k<5D1m z7(XiI^&ShBRN}sfyj4QfuZ^mn9`cfIp|C2v>}66}$GLe)D|wIhHw*!hH}-yr{QhG; zq0U`bP~Twvt?$3q*Xxr=Hnw4annfr?q19jzLm8m&l#Y_88Yw4APZcwdy? zi=&c^{h&e)RZI9jX(NIM1mtWn!bIIr^|Fb9zwR&}LJsy>4f_YKIJnTMG zbf!Ucr!>*t``ixnjt8Lq`@Q{jKwzNSkS<%}UlYy&6yN4P|Nhg}e@DQz@xnP@OS11a z`|5jtnf_PPzUv)yOzN?uX6T8=OuHNf4#!I_;1*3_?yc9pI*1EGFcJJiBQzI90OnPH^en5I+_tX74 zi7wmQl*XC!V|#u_5RVR8VCUj4Q|p@aHQc87Zi|&<0R~!ncb&&iKyzKsPk^;c@Twh+PoiyF)vW(ZRSIS zoz&Ks;oD*XVnfni-&I{xJ-4$YO$j$8ll-u%>(qYOy_dWgpPFN!j1`hEX32nWkMuFn zD2+YG=aF*;P1?Tv0BTl_dmp~I&=7US{qWEF|HZ`=nNIJLVR+;O+ZrPGL)I1Kp>pqK HOrHD))^@>| diff --git a/arrow-data-source/script/build_arrow.sh b/arrow-data-source/script/build_arrow.sh deleted file mode 100755 index 410e310709a2..000000000000 --- a/arrow-data-source/script/build_arrow.sh +++ /dev/null @@ -1,106 +0,0 @@ -#!/bin/bash - -set -eu - -NPROC=$(nproc) - -TESTS=OFF -BUILD_ARROW=OFF -STATIC_ARROW=OFF -ARROW_ROOT=/usr/local - -for arg in "$@" -do - case $arg in - -t=*|--tests=*) - TESTS=("${arg#*=}") - shift # Remove argument name from processing - ;; - -a=*|--build_arrow=*) - BUILD_ARROW=("${arg#*=}") - shift # Remove argument name from processing - ;; - -s=*|--static_arrow=*) - STATIC_ARROW=("${arg#*=}") - shift # Remove argument name from processing - ;; - -ar=*|--arrow_root=*) - ARROW_ROOT=("${arg#*=}") - shift # Remove argument name from processing - ;; - *) - OTHER_ARGUMENTS+=("$1") - shift # Remove generic argument from processing - ;; - esac -done - -echo "CMAKE Arguments:" -echo "TESTS=${TESTS}" -echo "BUILD_ARROW=${BUILD_ARROW}" -echo "STATIC_ARROW=${STATIC_ARROW}" -echo "ARROW_ROOT=${ARROW_ROOT}" - -CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) -echo $CURRENT_DIR - -cd ${CURRENT_DIR} -if [ -d build ]; then - rm -r build -fi - -if [ $BUILD_ARROW == "ON" ]; then -echo "Building Arrow from Source ..." 
-mkdir build -cd build -ARROW_PREFIX="${CURRENT_DIR}/build" # Use build directory as ARROW_PREFIX -ARROW_SOURCE_DIR="${ARROW_PREFIX}/arrow_ep" -ARROW_INSTALL_DIR="${ARROW_PREFIX}/arrow_install" - -echo "ARROW_PREFIX=${ARROW_PREFIX}" -echo "ARROW_SOURCE_DIR=${ARROW_SOURCE_DIR}" -echo "ARROW_INSTALL_DIR=${ARROW_INSTALL_DIR}" -mkdir -p $ARROW_SOURCE_DIR -mkdir -p $ARROW_INSTALL_DIR -git clone https://github.com/oap-project/arrow.git --branch arrow-4.0.0-oap $ARROW_SOURCE_DIR -pushd $ARROW_SOURCE_DIR - -cmake ./cpp \ - -DARROW_BUILD_STATIC=OFF -DARROW_BUILD_SHARED=ON -DARROW_COMPUTE=ON \ - -DARROW_S3=ON \ - -DARROW_GANDIVA_JAVA=ON \ - -DARROW_GANDIVA=ON \ - -DARROW_PARQUET=ON \ - -DARROW_HDFS=ON \ - -DARROW_BOOST_USE_SHARED=OFF \ - -DARROW_JNI=ON \ - -DARROW_DATASET=ON \ - -DARROW_WITH_PROTOBUF=ON \ - -DARROW_WITH_SNAPPY=ON \ - -DARROW_WITH_LZ4=ON \ - -DARROW_WITH_ZSTD=OFF \ - -DARROW_WITH_BROTLI=OFF \ - -DARROW_WITH_ZLIB=OFF \ - -DARROW_WITH_FASTPFOR=ON \ - -DARROW_FILESYSTEM=ON \ - -DARROW_JSON=ON \ - -DARROW_CSV=ON \ - -DARROW_FLIGHT=OFF \ - -DARROW_JEMALLOC=ON \ - -DARROW_SIMD_LEVEL=AVX2 \ - -DARROW_RUNTIME_SIMD_LEVEL=MAX \ - -DARROW_DEPENDENCY_SOURCE=BUNDLED \ - -DCMAKE_INSTALL_PREFIX=${ARROW_INSTALL_DIR} \ - -DCMAKE_INSTALL_LIBDIR=lib - -make -j$NPROC -make install - -cd java -mvn clean install -P arrow-jni -am -Darrow.cpp.build.dir=${ARROW_INSTALL_DIR}/lib -DskipTests -Dcheckstyle.skip -popd -echo "Finish to build Arrow from Source !!!" -else -echo "Use ARROW_ROOT as Arrow Library Path" -echo "ARROW_ROOT=${ARROW_ROOT}" -fi diff --git a/arrow-data-source/standard/pom.xml b/arrow-data-source/standard/pom.xml deleted file mode 100644 index fa35d662c6f8..000000000000 --- a/arrow-data-source/standard/pom.xml +++ /dev/null @@ -1,105 +0,0 @@ - - - - spark-arrow-datasource - com.intel.oap - 1.2.0-snapshot - - 4.0.0 - - spark-arrow-datasource-standard - - - - - com.intel.oap - spark-arrow-datasource-common - ${project.version} - - - - - ${project.basedir}/src/main/scala - ${project.basedir}/src/test/scala - - - org.apache.maven.plugins - maven-compiler-plugin - 3.8.0 - - 1.8 - 1.8 - - - - compile - - compile - - - - - - org.codehaus.mojo - build-helper-maven-plugin - - - add-src-1 - generate-sources - - add-source - - - - ${project.basedir}/src/main/java - - - - - - - maven-assembly-plugin - 3.3.0 - - - jar-with-dependencies - - - - - make-assembly - package - - single - - - - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar - - - - - - - - - - org.scala-tools - maven-scala-plugin - - ${scala.version} - - - - - diff --git a/arrow-data-source/standard/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/arrow-data-source/standard/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister deleted file mode 100644 index dfdfbba20034..000000000000 --- a/arrow-data-source/standard/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ /dev/null @@ -1 +0,0 @@ -com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowDataSourceV2 \ No newline at end of file diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/ArrowWriteExtension.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/ArrowWriteExtension.scala deleted file mode 100644 index 7f1d6e1530ca..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/ArrowWriteExtension.scala +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Licensed to the Apache 
Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.intel.oap.spark.sql - -import com.intel.oap.spark.sql.ArrowWriteExtension.ArrowWritePostRule -import com.intel.oap.spark.sql.ArrowWriteExtension.DummyRule -import com.intel.oap.spark.sql.ArrowWriteExtension.SimpleColumnarRule -import com.intel.oap.spark.sql.ArrowWriteExtension.SimpleStrategy -import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat -import com.intel.oap.sql.execution.RowToArrowColumnarExec -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{SparkSession, SparkSessionExtensions, Strategy} -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.catalyst.plans.logical.OrderPreservingUnaryNode - -import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.catalyst.util.ArrayData -import org.apache.spark.sql.catalyst.util.MapData -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec -import org.apache.spark.sql.execution.CodegenSupport -import org.apache.spark.sql.execution.ColumnarRule -import org.apache.spark.sql.execution.ColumnarToRowExec -import org.apache.spark.sql.execution.ColumnarToRowTransition -import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.command.DataWritingCommandExec -import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand -import org.apache.spark.sql.types.DataType -import org.apache.spark.sql.types.Decimal -import org.apache.spark.sql.vectorized.ColumnarBatch -import org.apache.spark.unsafe.types.CalendarInterval -import org.apache.spark.unsafe.types.UTF8String - -class ArrowWriteExtension extends (SparkSessionExtensions => Unit) { - def apply(e: SparkSessionExtensions): Unit = { - e.injectColumnar(session => SimpleColumnarRule(DummyRule, ArrowWritePostRule(session))) - e.injectPlannerStrategy(session => SimpleStrategy()) - } -} - -object ArrowWriteExtension { - private object DummyRule extends Rule[SparkPlan] { - def apply(p: SparkPlan): SparkPlan = p - } - - private case class SimpleColumnarRule(pre: Rule[SparkPlan], post: Rule[SparkPlan]) - extends ColumnarRule { - override def preColumnarTransitions: Rule[SparkPlan] = pre - override def postColumnarTransitions: Rule[SparkPlan] = post - } - - case class ArrowWritePostRule(session: SparkSession) extends Rule[SparkPlan] { - override def apply(plan: SparkPlan): SparkPlan = plan match { - case rc @ DataWritingCommandExec(cmd, ColumnarToRowExec(child)) => - cmd match { - case command: InsertIntoHadoopFsRelationCommand => - if (command.fileFormat - .isInstanceOf[ArrowFileFormat]) { - 
rc.withNewChildren(Array(ColumnarToFakeRowAdaptor(child))) - } else { - plan.withNewChildren(plan.children.map(apply)) - } - case _ => plan.withNewChildren(plan.children.map(apply)) - } - case rc @ DataWritingCommandExec(cmd, child) => - cmd match { - case command: InsertIntoHadoopFsRelationCommand => - if (command.fileFormat - .isInstanceOf[ArrowFileFormat]) { - child match { - case c: AdaptiveSparkPlanExec => - rc.withNewChildren( - Array( - AdaptiveSparkPlanExec( - ColumnarToFakeRowAdaptor(c.inputPlan), - c.context, - c.preprocessingRules, - c.isSubquery))) - case other => - rc.withNewChildren( - Array(ColumnarToFakeRowAdaptor(RowToArrowColumnarExec(child)))) - } - } else { - plan.withNewChildren(plan.children.map(apply)) - } - case _ => plan.withNewChildren(plan.children.map(apply)) - } - case plan: SparkPlan => plan.withNewChildren(plan.children.map(apply)) - } - } - - class FakeRow(val batch: ColumnarBatch) extends InternalRow { - override def numFields: Int = throw new UnsupportedOperationException() - override def setNullAt(i: Int): Unit = throw new UnsupportedOperationException() - override def update(i: Int, value: Any): Unit = throw new UnsupportedOperationException() - override def copy(): InternalRow = throw new UnsupportedOperationException() - override def isNullAt(ordinal: Int): Boolean = throw new UnsupportedOperationException() - override def getBoolean(ordinal: Int): Boolean = throw new UnsupportedOperationException() - override def getByte(ordinal: Int): Byte = throw new UnsupportedOperationException() - override def getShort(ordinal: Int): Short = throw new UnsupportedOperationException() - override def getInt(ordinal: Int): Int = throw new UnsupportedOperationException() - override def getLong(ordinal: Int): Long = throw new UnsupportedOperationException() - override def getFloat(ordinal: Int): Float = throw new UnsupportedOperationException() - override def getDouble(ordinal: Int): Double = throw new UnsupportedOperationException() - override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = - throw new UnsupportedOperationException() - override def getUTF8String(ordinal: Int): UTF8String = - throw new UnsupportedOperationException() - override def getBinary(ordinal: Int): Array[Byte] = throw new UnsupportedOperationException() - override def getInterval(ordinal: Int): CalendarInterval = - throw new UnsupportedOperationException() - override def getStruct(ordinal: Int, numFields: Int): InternalRow = - throw new UnsupportedOperationException() - override def getArray(ordinal: Int): ArrayData = throw new UnsupportedOperationException() - override def getMap(ordinal: Int): MapData = throw new UnsupportedOperationException() - override def get(ordinal: Int, dataType: DataType): AnyRef = - throw new UnsupportedOperationException() - } - - private case class ColumnarToFakeRowLogicAdaptor(child: LogicalPlan) - extends OrderPreservingUnaryNode { - override def output: Seq[Attribute] = child.output - } - - private case class ColumnarToFakeRowAdaptor(child: SparkPlan) extends ColumnarToRowTransition { - if (!child.logicalLink.isEmpty) { - setLogicalLink(ColumnarToFakeRowLogicAdaptor(child.logicalLink.get)) - } - - override protected def doExecute(): RDD[InternalRow] = { - child.executeColumnar().map { cb => new FakeRow(cb) } - } - - override def output: Seq[Attribute] = child.output - } - - case class SimpleStrategy() extends Strategy { - override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case ColumnarToFakeRowLogicAdaptor(child: 
LogicalPlan) => - Seq(ColumnarToFakeRowAdaptor(planLater(child))) - case other => - Nil - } - } - -} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/ArrowWriteQueue.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/ArrowWriteQueue.scala deleted file mode 100644 index d03ab27b06ff..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/ArrowWriteQueue.scala +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.intel.oap.spark.sql - -import java.lang -import java.net.URI -import java.util.Collections -import java.util.UUID -import java.util.concurrent.ArrayBlockingQueue -import java.util.concurrent.TimeUnit -import java.util.regex.Pattern - -import com.intel.oap.spark.sql.ArrowWriteQueue.EOS_BATCH -import com.intel.oap.spark.sql.ArrowWriteQueue.ScannerImpl -import org.apache.arrow.dataset.file.DatasetFileWriter -import org.apache.arrow.dataset.file.format.FileFormat -import org.apache.arrow.dataset.scanner.Scanner -import org.apache.arrow.dataset.scanner.ScanTask -import org.apache.arrow.vector.ipc.message.ArrowRecordBatch -import org.apache.arrow.vector.types.pojo.Schema - -class ArrowWriteQueue(schema: Schema, fileFormat: FileFormat, outputFileURI: String) - extends AutoCloseable { - private val scanner = new ScannerImpl(schema) - - private val writeThread = new Thread(() => { - URI.create(outputFileURI) // validate uri - val matcher = ArrowWriteQueue.TAILING_FILENAME_REGEX.matcher(outputFileURI) - if (!matcher.matches()) { - throw new IllegalArgumentException("illegal out put file uri: " + outputFileURI) - } - val dirURI = matcher.group(1) - val fileName = matcher.group(2) - - DatasetFileWriter.write(scanner, fileFormat, dirURI, Array(), 1, fileName) - }, "ArrowWriteQueue - " + UUID.randomUUID().toString) - - writeThread.start() - - def enqueue(batch: ArrowRecordBatch): Unit = { - scanner.enqueue(batch) - } - - override def close(): Unit = { - scanner.enqueue(EOS_BATCH) - writeThread.join() - } -} - -object ArrowWriteQueue { - private val TAILING_FILENAME_REGEX = Pattern.compile("^(.*)/([^/]+)$") - private val EOS_BATCH = new ArrowRecordBatch(0, Collections.emptyList(), Collections.emptyList()) - - class ScannerImpl(schema: Schema) extends Scanner { - private val writeQueue = new ArrayBlockingQueue[ArrowRecordBatch](64) - - def enqueue(batch: ArrowRecordBatch): Unit = { - writeQueue.put(batch) - } - - override def scan(): lang.Iterable[_ <: ScanTask] = { - Collections.singleton(new ScanTask { - override def execute(): ScanTask.BatchIterator = { - new ScanTask.BatchIterator { - private var currentBatch: Option[ArrowRecordBatch] = None - - override def hasNext: Boolean = { - if (currentBatch.isDefined) { - return true 
- } - val batch = try { - writeQueue.poll(30L, TimeUnit.MINUTES) - } catch { - case _: InterruptedException => - Thread.currentThread().interrupt() - EOS_BATCH - } - if (batch == null) { - throw new RuntimeException("ArrowWriter: Timeout waiting for data") - } - if (batch == EOS_BATCH) { - return false - } - currentBatch = Some(batch) - true - } - - override def next(): ArrowRecordBatch = { - if (currentBatch.isEmpty) { - throw new IllegalStateException() - } - try { - currentBatch.get - } finally { - currentBatch = None - } - } - - override def close(): Unit = { - - } - } - } - - override def close(): Unit = { - - } - }) - } - - override def schema(): Schema = { - schema - } - - override def close(): Unit = { - - } - } -} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/DataFrameReaderImplicits.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/DataFrameReaderImplicits.scala deleted file mode 100644 index ee3b5f291fc7..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/DataFrameReaderImplicits.scala +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.intel.oap.spark.sql - -import org.apache.spark.sql.{DataFrame, DataFrameReader} - -class DataFrameReaderImplicits(r: DataFrameReader) { - - /** - * Loads a file via Arrow Datasets API and returns the result as a `DataFrame`. - * - * @param path input path - * @since 3.0.0-SNAPSHOT - */ - def arrow(path: String): DataFrame = { - // This method ensures that calls that explicit need single argument works, see SPARK-16009 - arrow(Seq(path): _*) - } - - /** - * Loads files via Arrow Datasets API and returns the result as a `DataFrame`. - * - * @param paths input paths - * @since 3.0.0-SNAPSHOT - */ - @scala.annotation.varargs - def arrow(paths: String*): DataFrame = r.format("arrow").load(paths: _*) -} - -object DataFrameReaderImplicits { - implicit def readerConverter(r: DataFrameReader): DataFrameReaderImplicits = { - new DataFrameReaderImplicits(r) - } -} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/DataFrameWriterImplicits.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/DataFrameWriterImplicits.scala deleted file mode 100644 index 4df6135b9468..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/DataFrameWriterImplicits.scala +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.intel.oap.spark.sql - -import org.apache.spark.sql.DataFrameWriter - -class DataFrameWriterImplicits[T](w: DataFrameWriter[T]) { - - def arrow(path: String): Unit = { - // This method ensures that calls that explicit need single argument works, see SPARK-16009 - w.format("arrow").save(path) - } -} - -object DataFrameWriterImplicits { - implicit def writerConverter[T](w: DataFrameWriter[T]): DataFrameWriterImplicits[T] = { - new DataFrameWriterImplicits[T](w) - } -} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowFileFormat.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowFileFormat.scala deleted file mode 100644 index 47a21048e575..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowFileFormat.scala +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.intel.oap.spark.sql.execution.datasources.arrow - -import java.net.URLDecoder - -import scala.collection.JavaConverters._ - -import com.intel.oap.spark.sql.ArrowWriteExtension.FakeRow -import com.intel.oap.spark.sql.ArrowWriteQueue -import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions, ArrowUtils} -import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._ -import com.intel.oap.vectorized.ArrowWritableColumnVector -import org.apache.arrow.dataset.scanner.ScanOptions -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.hadoop.mapreduce.Job -import org.apache.hadoop.mapreduce.TaskAttemptContext -import org.apache.parquet.hadoop.codec.CodecConfig - -import org.apache.spark.TaskContext -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile} -import org.apache.spark.sql.execution.datasources.OutputWriter -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils.UnsafeItr -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkVectorUtils -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.{DataSourceRegister, Filter} -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.util.CaseInsensitiveStringMap -import org.apache.spark.sql.vectorized.ColumnarBatch; - -class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializable { - - - override def isSplitable(sparkSession: SparkSession, - options: Map[String, String], path: Path): Boolean = { - ArrowUtils.isOriginalFormatSplitable( - new ArrowOptions(new CaseInsensitiveStringMap(options.asJava).asScala.toMap)) - } - - def convert(files: Seq[FileStatus], options: Map[String, String]): Option[StructType] = { - ArrowUtils.readSchema(files, new CaseInsensitiveStringMap(options.asJava)) - } - - override def inferSchema(sparkSession: SparkSession, - options: Map[String, String], - files: Seq[FileStatus]): Option[StructType] = { - convert(files, options) - } - - override def prepareWrite(sparkSession: SparkSession, - job: Job, - options: Map[String, String], - dataSchema: StructType): OutputWriterFactory = { - val arrowOptions = new ArrowOptions(new CaseInsensitiveStringMap(options.asJava).asScala.toMap) - new OutputWriterFactory { - override def getFileExtension(context: TaskAttemptContext): String = { - ArrowUtils.getFormat(arrowOptions) match { - case _: org.apache.arrow.dataset.file.format.ParquetFileFormat => - CodecConfig.from(context).getCodec.getExtension + ".parquet" - case f => throw new IllegalArgumentException("Unimplemented file type to write: " + f) - } - } - - override def newInstance(path: String, dataSchema: StructType, - context: TaskAttemptContext): OutputWriter = { - val writeQueue = new ArrowWriteQueue(ArrowUtils.toArrowSchema(dataSchema), - ArrowUtils.getFormat(arrowOptions), path) - - new OutputWriter { - override def write(row: InternalRow): Unit = { - val batch = row.asInstanceOf[FakeRow].batch - writeQueue.enqueue(SparkVectorUtils - .toArrowRecordBatch(batch)) - } - - override def close(): Unit = { - writeQueue.close() - } - } - } - } - } - - override def supportBatch(sparkSession: SparkSession, dataSchema: StructType): Boolean = true - - override def buildReaderWithPartitionValues(sparkSession: SparkSession, - dataSchema: StructType, - partitionSchema: StructType, - 
requiredSchema: StructType, - filters: Seq[Filter], - options: Map[String, String], - hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { - val sqlConf = sparkSession.sessionState.conf; - val batchSize = sqlConf.parquetVectorizedReaderBatchSize - val enableFilterPushDown = sqlConf.arrowFilterPushDown - - (file: PartitionedFile) => { - val factory = ArrowUtils.makeArrowDiscovery( - URLDecoder.decode(file.filePath, "UTF-8"), file.start, file.length, - new ArrowOptions( - new CaseInsensitiveStringMap( - options.asJava).asScala.toMap)) - - // todo predicate validation / pushdown - val dataset = factory.finish(); - - val filter = if (enableFilterPushDown) { - ArrowFilters.translateFilters(filters) - } else { - org.apache.arrow.dataset.filter.Filter.EMPTY - } - - val scanOptions = new ScanOptions(requiredSchema.map(f => f.name).toArray, - filter, batchSize) - val scanner = dataset.newScan(scanOptions) - - val taskList = scanner - .scan() - .iterator() - .asScala - .toList - val itrList = taskList - .map(task => task.execute()) - - Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => { - itrList.foreach(_.close()) - taskList.foreach(_.close()) - scanner.close() - dataset.close() - factory.close() - })) - - val itr = itrList - .toIterator - .flatMap(itr => itr.asScala) - .map(batch => ArrowUtils.loadBatch(batch, file.partitionValues, partitionSchema, - requiredSchema)) - new UnsafeItr(itr).asInstanceOf[Iterator[InternalRow]] - } - } - - override def vectorTypes(requiredSchema: StructType, partitionSchema: StructType, - sqlConf: SQLConf): Option[Seq[String]] = { - Option(Seq.fill(requiredSchema.fields.length + partitionSchema.fields.length)( - classOf[ArrowWritableColumnVector].getName - )) - } - - override def shortName(): String = "arrow" -} - -object ArrowFileFormat { -} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowDataSourceV2.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowDataSourceV2.scala deleted file mode 100644 index b688c06d8258..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowDataSourceV2.scala +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.intel.oap.spark.sql.execution.datasources.v2.arrow - -import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat - -import org.apache.spark.sql.connector.catalog.Table -import org.apache.spark.sql.execution.datasources.FileFormat -import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 -import org.apache.spark.sql.util.CaseInsensitiveStringMap - -class ArrowDataSourceV2 extends FileDataSourceV2 { - - private val format = classOf[ArrowFileFormat] - - override def fallbackFileFormat: Class[_ <: FileFormat] = { - format - } - - override def getTable(options: CaseInsensitiveStringMap): Table = { - val paths = getPaths(options) - val tableName = getTableName(options, paths) - ArrowTable(tableName, sparkSession, options, paths, None, fallbackFileFormat) - } - - override def shortName(): String = "arrow" -} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowFilters.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowFilters.scala deleted file mode 100644 index f33c7995a852..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowFilters.scala +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.intel.oap.spark.sql.execution.datasources.v2.arrow - -import org.apache.arrow.dataset.DatasetTypes -import org.apache.arrow.dataset.DatasetTypes.TreeNode -import org.apache.arrow.dataset.filter.FilterImpl - -import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.StructType - -object ArrowFilters { - def pruneWithSchema(pushedFilters: Array[Filter], schema: StructType): Seq[Filter] = { - pushedFilters.filter(pushedFilter => { - isToBeAccepted(pushedFilter, schema) - }) - } - - private def isToBeAccepted(pushedFilter: Filter, schema: StructType): Boolean = { - pushedFilter match { - case EqualTo(attribute, value) => existsIn(attribute, schema) - case GreaterThan(attribute, value) => existsIn(attribute, schema) - case GreaterThanOrEqual(attribute, value) => existsIn(attribute, schema) - case LessThan(attribute, value) => existsIn(attribute, schema) - case LessThanOrEqual(attribute, value) => existsIn(attribute, schema) - case Not(child) => isToBeAccepted(child, schema) - case And(left, right) => isToBeAccepted(left, schema) && isToBeAccepted(right, schema) - case Or(left, right) => isToBeAccepted(left, schema) && isToBeAccepted(right, schema) - case IsNotNull(attribute) => existsIn(attribute, schema) - case IsNull(attribute) => existsIn(attribute, schema) - case _ => false // fixme complete this - } - } - - private def existsIn(attr: String, schema: StructType): Boolean = { - schema.foreach(f => { - if (f.name == attr) { - return true; - } - }) - false - } - - def translateFilters(pushedFilters: Seq[Filter]): org.apache.arrow.dataset.filter.Filter = { - val node = pushedFilters - .flatMap(translateFilter) - .reduceOption((t1: TreeNode, t2: TreeNode) => { - DatasetTypes.TreeNode.newBuilder.setAndNode( - DatasetTypes.AndNode.newBuilder() - .setLeftArg(t1) - .setRightArg(t2) - .build()).build() - }) - if (node.isDefined) { - new FilterImpl(DatasetTypes.Condition.newBuilder() - .setRoot(node.get).build) - } else { - org.apache.arrow.dataset.filter.Filter.EMPTY - } - } - - private def translateValue(value: Any): Option[TreeNode] = { - value match { - case v: Integer => Some( - DatasetTypes.TreeNode.newBuilder.setIntNode( - DatasetTypes.IntNode.newBuilder.setValue(v).build) - .build) - case v: Long => Some( - DatasetTypes.TreeNode.newBuilder.setLongNode( - DatasetTypes.LongNode.newBuilder.setValue(v).build) - .build) - case v: Float => Some( - DatasetTypes.TreeNode.newBuilder.setFloatNode( - DatasetTypes.FloatNode.newBuilder.setValue(v).build) - .build) - case v: Double => Some( - DatasetTypes.TreeNode.newBuilder.setDoubleNode( - DatasetTypes.DoubleNode.newBuilder.setValue(v).build) - .build) - case v: Boolean => Some( - DatasetTypes.TreeNode.newBuilder.setBooleanNode( - DatasetTypes.BooleanNode.newBuilder.setValue(v).build) - .build) - case _ => None // fixme complete this - } - } - - private def translateFilter(pushedFilter: Filter): Option[TreeNode] = { - pushedFilter match { - case EqualTo(attribute, value) => - createComparisonNode("equal", attribute, value) - case GreaterThan(attribute, value) => - createComparisonNode("greater", attribute, value) - case GreaterThanOrEqual(attribute, value) => - createComparisonNode("greater_equal", attribute, value) - case LessThan(attribute, value) => - createComparisonNode("less", attribute, value) - case LessThanOrEqual(attribute, value) => - createComparisonNode("less_equal", attribute, value) - case Not(child) => - createNotNode(child) - case And(left, right) => - createAndNode(left, right) - case Or(left, right) => 
- createOrNode(left, right) - case IsNotNull(attribute) => - createIsNotNullNode(attribute) - case IsNull(attribute) => - createIsNullNode(attribute) - case _ => None // fixme complete this - } - } - - private def createComparisonNode(opName: String, - attribute: String, value: Any): Option[TreeNode] = { - val translatedValue = translateValue(value) - translatedValue match { - case Some(v) => Some( - DatasetTypes.TreeNode.newBuilder.setCpNode( - DatasetTypes.ComparisonNode.newBuilder - .setOpName(opName) // todo make op names enumerable - .setLeftArg( - DatasetTypes.TreeNode.newBuilder.setFieldNode( - DatasetTypes.FieldNode.newBuilder.setName(attribute).build) - .build) - .setRightArg(v) - .build) - .build) - case None => None - } - } - - def createNotNode(child: Filter): Option[TreeNode] = { - val translatedChild = translateFilter(child) - if (translatedChild.isEmpty) { - return None - } - Some(DatasetTypes.TreeNode.newBuilder - .setNotNode(DatasetTypes.NotNode.newBuilder.setArgs(translatedChild.get).build()).build()) - } - - def createIsNotNullNode(attribute: String): Option[TreeNode] = { - Some(DatasetTypes.TreeNode.newBuilder - .setIsValidNode( - DatasetTypes.IsValidNode.newBuilder.setArgs( - DatasetTypes.TreeNode.newBuilder.setFieldNode( - DatasetTypes.FieldNode.newBuilder.setName(attribute).build) - .build).build()).build()) - } - - def createIsNullNode(attribute: String): Option[TreeNode] = { - Some(DatasetTypes.TreeNode.newBuilder - .setNotNode( - DatasetTypes.NotNode.newBuilder.setArgs( - DatasetTypes.TreeNode.newBuilder - .setIsValidNode( - DatasetTypes.IsValidNode.newBuilder.setArgs( - DatasetTypes.TreeNode.newBuilder.setFieldNode( - DatasetTypes.FieldNode.newBuilder.setName(attribute).build) - .build) - .build()).build()).build()).build()) - } - - def createAndNode(left: Filter, right: Filter): Option[TreeNode] = { - val translatedLeft = translateFilter(left) - val translatedRight = translateFilter(right) - if (translatedLeft.isEmpty || translatedRight.isEmpty) { - return None - } - Some(DatasetTypes.TreeNode.newBuilder - .setAndNode(DatasetTypes.AndNode.newBuilder - .setLeftArg(translatedLeft.get) - .setRightArg(translatedRight.get) - .build()) - .build()) - } - - def createOrNode(left: Filter, right: Filter): Option[TreeNode] = { - val translatedLeft = translateFilter(left) - val translatedRight = translateFilter(right) - if (translatedLeft.isEmpty || translatedRight.isEmpty) { - return None - } - Some(DatasetTypes.TreeNode.newBuilder - .setOrNode(DatasetTypes.OrNode.newBuilder - .setLeftArg(translatedLeft.get) - .setRightArg(translatedRight.get) - .build()) - .build()) - } -} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowOptions.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowOptions.scala deleted file mode 100644 index efaf9a589ee9..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowOptions.scala +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.intel.oap.spark.sql.execution.datasources.v2.arrow - -import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap - -class ArrowOptions(val parameters: CaseInsensitiveMap[String]) - extends Serializable { - - def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters)) - - val originalFormat = parameters - .get(ArrowOptions.KEY_ORIGINAL_FORMAT) - .getOrElse(ArrowOptions.DEFAULT_ORIGINAL_FORMAT) - val targetFormat = parameters - .get(ArrowOptions.KEY_TARGET_FORMAT) - .getOrElse(ArrowOptions.DEFAULT_TARGET_FORMAT) - - @deprecated - val filesystem = parameters - .get(ArrowOptions.KEY_FILESYSTEM) - .getOrElse(ArrowOptions.DEFAULT_FILESYSTEM) -} - -object ArrowOptions { - val KEY_ORIGINAL_FORMAT = "originalFormat" - val DEFAULT_ORIGINAL_FORMAT = "parquet" - val KEY_TARGET_FORMAT = "targetFormat" - val DEFAULT_TARGET_FORMAT = "parquet" - - @deprecated - val KEY_FILESYSTEM = "filesystem" - val DEFAULT_FILESYSTEM = "hdfs" -} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowPartitionReaderFactory.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowPartitionReaderFactory.scala deleted file mode 100644 index 99ccd781a595..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowPartitionReaderFactory.scala +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.intel.oap.spark.sql.execution.datasources.v2.arrow - -import java.net.URLDecoder - -import scala.collection.JavaConverters._ - -import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowPartitionReaderFactory.ColumnarBatchRetainer -import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._ -import org.apache.arrow.dataset.scanner.ScanOptions - -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader} -import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.execution.datasources.v2.FilePartitionReaderFactory -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.vectorized.ColumnarBatch -import org.apache.spark.util.SerializableConfiguration - -case class ArrowPartitionReaderFactory( - sqlConf: SQLConf, - broadcastedConf: Broadcast[SerializableConfiguration], - readDataSchema: StructType, - readPartitionSchema: StructType, - pushedFilters: Array[Filter], - options: ArrowOptions) - extends FilePartitionReaderFactory { - - private val batchSize = sqlConf.parquetVectorizedReaderBatchSize - private val enableFilterPushDown: Boolean = sqlConf.arrowFilterPushDown - - override def supportColumnarReads(partition: InputPartition): Boolean = true - - override def buildReader(partitionedFile: PartitionedFile): PartitionReader[InternalRow] = { - // disable row based read - throw new UnsupportedOperationException - } - - override def buildColumnarReader( - partitionedFile: PartitionedFile): PartitionReader[ColumnarBatch] = { - val path = partitionedFile.filePath - val factory = ArrowUtils.makeArrowDiscovery(URLDecoder.decode(path, "UTF-8"), - partitionedFile.start, partitionedFile.length, options) - val dataset = factory.finish() - val filter = if (enableFilterPushDown) { - ArrowFilters.translateFilters(ArrowFilters.pruneWithSchema(pushedFilters, readDataSchema)) - } else { - org.apache.arrow.dataset.filter.Filter.EMPTY - } - val scanOptions = new ScanOptions(readDataSchema.map(f => f.name).toArray, - filter, batchSize) - val scanner = dataset.newScan(scanOptions) - - val taskList = scanner - .scan() - .iterator() - .asScala - .toList - - val vsrItrList = taskList - .map(task => task.execute()) - - val batchItr = vsrItrList - .toIterator - .flatMap(itr => itr.asScala) - .map(batch => ArrowUtils.loadBatch(batch, partitionedFile.partitionValues, - readPartitionSchema, readDataSchema)) - - new PartitionReader[ColumnarBatch] { - val holder = new ColumnarBatchRetainer() - - override def next(): Boolean = { - holder.release() - batchItr.hasNext - } - - override def get(): ColumnarBatch = { - val batch = batchItr.next() - holder.retain(batch) - batch - } - - override def close(): Unit = { - holder.release() - vsrItrList.foreach(itr => itr.close()) - taskList.foreach(task => task.close()) - scanner.close() - dataset.close() - factory.close() - } - } - } -} - -object ArrowPartitionReaderFactory { - private class ColumnarBatchRetainer { - private var retained: Option[ColumnarBatch] = None - - def retain(batch: ColumnarBatch): Unit = { - if (retained.isDefined) { - throw new IllegalStateException - } - retained = Some(batch) - } - - def release(): Unit = { - retained.foreach(b => b.close()) - retained = None - } - } -} diff --git 
a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowSQLConf.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowSQLConf.scala deleted file mode 100644 index 3b0cb69325dc..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowSQLConf.scala +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.intel.oap.spark.sql.execution.datasources.v2.arrow - -import org.apache.spark.sql.internal.SQLConf - -object ArrowSQLConf { - val ARROW_FILTER_PUSHDOWN_ENABLED = SQLConf.buildConf("spark.sql.arrow.filterPushdown") - .doc("Enables Arrow filter push-down optimization when set to true.") - .booleanConf - .createWithDefault(true) - - implicit def fromSQLConf(c: SQLConf): ArrowSQLConf = { - new ArrowSQLConf(c) - } -} - -class ArrowSQLConf(c: SQLConf) { - def arrowFilterPushDown: Boolean = c.getConf(ArrowSQLConf.ARROW_FILTER_PUSHDOWN_ENABLED) -} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowScan.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowScan.scala deleted file mode 100644 index 20e069e01108..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowScan.scala +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.intel.oap.spark.sql.execution.datasources.v2.arrow - -import scala.collection.JavaConverters._ - -import org.apache.hadoop.fs.Path - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.connector.read.PartitionReaderFactory -import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex -import org.apache.spark.sql.execution.datasources.v2.FileScan -import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.util.CaseInsensitiveStringMap -import org.apache.spark.util.SerializableConfiguration - -case class ArrowScan( - sparkSession: SparkSession, - fileIndex: PartitioningAwareFileIndex, - readDataSchema: StructType, - readPartitionSchema: StructType, - pushedFilters: Array[Filter], - options: CaseInsensitiveStringMap, - partitionFilters: Seq[Expression] = Seq.empty, - dataFilters: Seq[Expression] = Seq.empty) - extends FileScan { - - override def isSplitable(path: Path): Boolean = { - ArrowUtils.isOriginalFormatSplitable( - new ArrowOptions(new CaseInsensitiveStringMap(options).asScala.toMap)) - } - - override def createReaderFactory(): PartitionReaderFactory = { - val caseSensitiveMap = options.asCaseSensitiveMap().asScala.toMap - val hconf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) - val broadcastedConf = - sparkSession.sparkContext.broadcast(new SerializableConfiguration(hconf)) - ArrowPartitionReaderFactory( - sparkSession.sessionState.conf, - broadcastedConf, - readDataSchema, - readPartitionSchema, - pushedFilters, - new ArrowOptions(options.asScala.toMap)) - } - - override def withFilters(partitionFilters: Seq[Expression], - dataFilters: Seq[Expression]): FileScan = - this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) -} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowScanBuilder.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowScanBuilder.scala deleted file mode 100644 index 6583455e030f..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowScanBuilder.scala +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.intel.oap.spark.sql.execution.datasources.v2.arrow - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.connector.read.{Scan, SupportsPushDownFilters} -import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex -import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder -import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.util.CaseInsensitiveStringMap - -case class ArrowScanBuilder( - sparkSession: SparkSession, - fileIndex: PartitioningAwareFileIndex, - schema: StructType, - dataSchema: StructType, - options: CaseInsensitiveStringMap) - extends FileScanBuilder(sparkSession, fileIndex, dataSchema) - with SupportsPushDownFilters { - - private var filters: Array[Filter] = Array.empty - private lazy val pushedArrowFilters: Array[Filter] = { - filters // todo filter validation & pushdown - } - - override def pushFilters(filters: Array[Filter]): Array[Filter] = { - this.filters = filters - this.filters - } - - override def pushedFilters: Array[Filter] = pushedArrowFilters - - override def build(): Scan = { - ArrowScan( - sparkSession, - fileIndex, - readDataSchema(), - readPartitionSchema(), - pushedFilters, - options) - } -} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowTable.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowTable.scala deleted file mode 100644 index c25864a538c0..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowTable.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.intel.oap.spark.sql.execution.datasources.v2.arrow - -import org.apache.arrow.memory.AllocationListener -import org.apache.hadoop.fs.FileStatus - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.connector.read.ScanBuilder -import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} -import org.apache.spark.sql.execution.datasources.FileFormat -import org.apache.spark.sql.execution.datasources.v2.FileTable -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.util.CaseInsensitiveStringMap - -case class ArrowTable( - name: String, - sparkSession: SparkSession, - options: CaseInsensitiveStringMap, - paths: Seq[String], - userSpecifiedSchema: Option[StructType], - fallbackFileFormat: Class[_ <: FileFormat]) - extends FileTable(sparkSession, options, paths, userSpecifiedSchema) { - - override def inferSchema(files: Seq[FileStatus]): Option[StructType] = { - ArrowUtils.readSchema(files, options) - } - - override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { - ArrowScanBuilder(sparkSession, fileIndex, schema, dataSchema, options) - } - - override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { - throw new UnsupportedOperationException // fixme implement later - } - - override def formatName: String = "ARROW" -} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowUtils.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowUtils.scala deleted file mode 100644 index 07c572cddaf1..000000000000 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowUtils.scala +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.intel.oap.spark.sql.execution.datasources.v2.arrow - -import java.net.URI -import java.time.ZoneId - -import scala.collection.JavaConverters._ - -import com.intel.oap.vectorized.ArrowWritableColumnVector -import org.apache.arrow.dataset.file.FileSystemDatasetFactory -import org.apache.arrow.vector.ipc.message.ArrowRecordBatch -import org.apache.arrow.vector.types.pojo.Schema -import org.apache.hadoop.fs.FileStatus - -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.datasources.v2.arrow.{SparkMemoryUtils, SparkSchemaUtils} -import org.apache.spark.sql.execution.vectorized.ColumnVectorUtils -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.util.CaseInsensitiveStringMap -import org.apache.spark.sql.vectorized.ColumnVector -import org.apache.spark.sql.vectorized.ColumnarBatch - -object ArrowUtils { - - def readSchema(file: FileStatus, options: CaseInsensitiveStringMap): Option[StructType] = { - val factory: FileSystemDatasetFactory = - makeArrowDiscovery(file.getPath.toString, -1L, -1L, - new ArrowOptions(options.asScala.toMap)) - val schema = factory.inspect() - try { - Option(SparkSchemaUtils.fromArrowSchema(schema)) - } finally { - factory.close() - } - } - - def readSchema(files: Seq[FileStatus], options: CaseInsensitiveStringMap): Option[StructType] = { - if (files.isEmpty) { - throw new IllegalArgumentException("No input file specified") - } - readSchema(files.toList.head, options) // todo merge schema - } - - def isOriginalFormatSplitable(options: ArrowOptions): Boolean = { - val format = getFormat(options) - format match { - case _: org.apache.arrow.dataset.file.format.ParquetFileFormat => - true - case _ => - false - } - } - - def makeArrowDiscovery(file: String, startOffset: Long, length: Long, - options: ArrowOptions): FileSystemDatasetFactory = { - - val format = getFormat(options) - val allocator = SparkMemoryUtils.contextAllocator() - val factory = new FileSystemDatasetFactory(allocator, - SparkMemoryUtils.contextMemoryPool(), - format, - rewriteUri(file), - startOffset, - length) - factory - } - - def toArrowSchema(t: StructType): Schema = { - // fixme this might be platform dependent - SparkSchemaUtils.toArrowSchema(t, SparkSchemaUtils.getLocalTimezoneID()) - } - - def loadBatch(input: ArrowRecordBatch, partitionValues: InternalRow, - partitionSchema: StructType, dataSchema: StructType): ColumnarBatch = { - val rowCount: Int = input.getLength - - val vectors = try { - ArrowWritableColumnVector.loadColumns(rowCount, toArrowSchema(dataSchema), input) - } finally { - input.close() - } - val partitionColumns = ArrowWritableColumnVector.allocateColumns(rowCount, partitionSchema) - (0 until partitionColumns.length).foreach(i => { - ColumnVectorUtils.populate(partitionColumns(i), partitionValues, i) - partitionColumns(i).setValueCount(rowCount) - partitionColumns(i).setIsConstant() - }) - - val batch = new ColumnarBatch( - vectors.map(_.asInstanceOf[ColumnVector]) ++ - partitionColumns.map(_.asInstanceOf[ColumnVector]), - rowCount) - batch - } - - def getFormat( - options: ArrowOptions): org.apache.arrow.dataset.file.format.FileFormat = { - val paramMap = options.parameters.toMap.asJava - options.originalFormat match { - case "parquet" => org.apache.arrow.dataset.file.format.ParquetFileFormat.create(paramMap) - case "csv" => org.apache.arrow.dataset.file.format.CsvFileFormat.create(paramMap) - case _ 
=> throw new IllegalArgumentException("Unrecognizable format") - } - } - - private def rewriteUri(uriStr: String): String = { - val uri = URI.create(uriStr) - if (uri.getScheme == "s3" || uri.getScheme == "s3a") { - val s3Rewritten = new URI("s3", uri.getAuthority, - uri.getPath, uri.getQuery, uri.getFragment).toString - return s3Rewritten - } - val sch = uri.getScheme match { - case "hdfs" => "hdfs" - case "file" => "file" - } - val ssp = uri.getScheme match { - case "hdfs" => uri.getRawSchemeSpecificPart - case "file" => "//" + uri.getRawSchemeSpecificPart - } - val rewritten = new URI(sch, ssp, uri.getFragment) - rewritten.toString - } -} diff --git a/arrow-data-source/standard/src/test/resources/cars.csv b/arrow-data-source/standard/src/test/resources/cars.csv deleted file mode 100644 index 40ded573ade5..000000000000 --- a/arrow-data-source/standard/src/test/resources/cars.csv +++ /dev/null @@ -1,7 +0,0 @@ - -year,make,model,comment,blank -"2012","Tesla","S","No comment", - -1997,Ford,E350,"Go get one now they are going fast", -2015,Chevy,Volt - diff --git a/arrow-data-source/standard/src/test/resources/example-tab.csv b/arrow-data-source/standard/src/test/resources/example-tab.csv deleted file mode 100644 index f86f114fb45d..000000000000 --- a/arrow-data-source/standard/src/test/resources/example-tab.csv +++ /dev/null @@ -1,35 +0,0 @@ -id1 id2 id3 id4 id5 id6 v1 v2 v3 -id016 id016 id0000042202 15 24 5971 5 11 37.211254 -id039 id045 id0000029558 40 49 39457 5 4 48.951141 -id047 id023 id0000071286 68 20 74463 2 14 60.469241 -id043 id057 id0000015141 32 43 63743 1 15 7.692145 -id054 id040 id0000011083 9 25 16920 2 9 22.863525 -id029 id020 id0000017974 40 43 14435 3 13 87.521355 -id047 id023 id0000084849 90 96 35790 2 9 93.348148 -id091 id022 id0000031441 50 44 71525 3 11 81.013682 -id090 id048 id0000067778 24 2 51862 4 9 30.718821 -id070 id008 id0000091167 78 4 23333 5 15 70.95464 -id039 id084 id0000013708 94 81 44406 1 3 54.368009 -id023 id061 id0000011331 36 67 86498 5 2 13.847979 -id070 id054 id0000099110 24 15 47054 4 2 92.057305 -id022 id008 id0000038862 38 92 63088 3 10 33.517765 -id020 id070 id0000028952 17 57 50831 4 15 48.060814 -id078 id022 id0000082008 69 44 15891 1 4 95.75571 -id024 id033 id0000074157 1 57 83341 2 1 72.118722 -id053 id076 id0000061759 55 43 59469 5 10 10.574836 -id058 id087 id0000094028 14 49 72962 4 4 37.914258 -id095 id091 id0000066931 35 20 98979 3 3 16.733062 -id054 id061 id0000004843 69 58 14096 4 13 53.746802 -id019 id078 id0000047661 5 33 13347 5 5 95.013936 -id086 id088 id0000039469 45 86 65332 3 11 65.71087 -id021 id055 id0000035603 96 97 36475 4 9 90.835613 -id004 id034 id0000008260 99 8 73046 3 11 69.540405 -id053 id052 id0000008764 47 13 49231 1 15 32.039599 -id014 id050 id0000066034 45 32 33268 2 3 93.752279 -id099 id057 id0000062408 27 7 63984 5 6 77.454794 -id013 id067 id0000046109 69 90 21214 4 6 83.899656 -id042 id043 id0000025883 64 21 85711 4 14 84.141247 -id024 id062 id0000026824 79 16 49757 2 10 15.822967 -id058 id077 id0000016555 71 8 24728 3 9 92.085521 -id053 id012 id0000005595 73 28 79781 2 10 6.053862 -id100 id096 id0000073858 11 9 25962 1 10 87.268781 diff --git a/arrow-data-source/standard/src/test/resources/example.csv b/arrow-data-source/standard/src/test/resources/example.csv deleted file mode 100644 index 670d7c25ce57..000000000000 --- a/arrow-data-source/standard/src/test/resources/example.csv +++ /dev/null @@ -1,35 +0,0 @@ -id1,id2,id3,id4,id5,id6,v1,v2,v3 -id016,id016,id0000042202,15,24,5971,5,11,37.211254 
-id039,id045,id0000029558,40,49,39457,5,4,48.951141 -id047,id023,id0000071286,68,20,74463,2,14,60.469241 -id043,id057,id0000015141,32,43,63743,1,15,7.692145 -id054,id040,id0000011083,9,25,16920,2,9,22.863525 -id029,id020,id0000017974,40,43,14435,3,13,87.521355 -id047,id023,id0000084849,90,96,35790,2,9,93.348148 -id091,id022,id0000031441,50,44,71525,3,11,81.013682 -id090,id048,id0000067778,24,2,51862,4,9,30.718821 -id070,id008,id0000091167,78,4,23333,5,15,70.95464 -id039,id084,id0000013708,94,81,44406,1,3,54.368009 -id023,id061,id0000011331,36,67,86498,5,2,13.847979 -id070,id054,id0000099110,24,15,47054,4,2,92.057305 -id022,id008,id0000038862,38,92,63088,3,10,33.517765 -id020,id070,id0000028952,17,57,50831,4,15,48.060814 -id078,id022,id0000082008,69,44,15891,1,4,95.75571 -id024,id033,id0000074157,1,57,83341,2,1,72.118722 -id053,id076,id0000061759,55,43,59469,5,10,10.574836 -id058,id087,id0000094028,14,49,72962,4,4,37.914258 -id095,id091,id0000066931,35,20,98979,3,3,16.733062 -id054,id061,id0000004843,69,58,14096,4,13,53.746802 -id019,id078,id0000047661,5,33,13347,5,5,95.013936 -id086,id088,id0000039469,45,86,65332,3,11,65.71087 -id021,id055,id0000035603,96,97,36475,4,9,90.835613 -id004,id034,id0000008260,99,8,73046,3,11,69.540405 -id053,id052,id0000008764,47,13,49231,1,15,32.039599 -id014,id050,id0000066034,45,32,33268,2,3,93.752279 -id099,id057,id0000062408,27,7,63984,5,6,77.454794 -id013,id067,id0000046109,69,90,21214,4,6,83.899656 -id042,id043,id0000025883,64,21,85711,4,14,84.141247 -id024,id062,id0000026824,79,16,49757,2,10,15.822967 -id058,id077,id0000016555,71,8,24728,3,9,92.085521 -id053,id012,id0000005595,73,28,79781,2,10,6.053862 -id100,id096,id0000073858,11,9,25962,1,10,87.268781 diff --git a/arrow-data-source/standard/src/test/resources/people.csv b/arrow-data-source/standard/src/test/resources/people.csv deleted file mode 100644 index 4d9b27bf9ac8..000000000000 --- a/arrow-data-source/standard/src/test/resources/people.csv +++ /dev/null @@ -1,3 +0,0 @@ -name,age,job -Jorge,30,Developer -Bob,32,Developer diff --git a/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTPCHBasedTest.scala b/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTPCHBasedTest.scala deleted file mode 100644 index b33d4a1d32cb..000000000000 --- a/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTPCHBasedTest.scala +++ /dev/null @@ -1,286 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.intel.oap.spark.sql.execution.datasources.arrow - -import java.util.concurrent.{Executors, TimeUnit} - -import com.intel.oap.spark.sql.DataFrameReaderImplicits._ -import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowOptions, ArrowUtils} - -import org.apache.spark.SparkConf -import org.apache.spark.sql.QueryTest -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.test.SharedSparkSession - -class ArrowDataSourceTPCHBasedTest extends QueryTest with SharedSparkSession { - - // tpc-h query cases: generated tpc-h dataset required - private val prefix = "/root/Downloads/" - private val tpchFolder = "date_tpch_10" - private val lineitem = prefix + tpchFolder + "/lineitem" - private val part = prefix + tpchFolder + "/part" - private val partSupp = prefix + tpchFolder + "/partsupp" - private val supplier = prefix + tpchFolder + "/supplier" - private val orders = prefix + tpchFolder + "/orders" - private val nation = prefix + tpchFolder + "/nation" - - - override protected def sparkConf: SparkConf = { - val conf = super.sparkConf - conf.set("spark.memory.offHeap.size", String.valueOf(128 * 1024 * 1024)) - conf - } - - ignore("tpch lineitem - desc") { - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(lineitem) - frame.createOrReplaceTempView("lineitem") - - spark.sql("describe lineitem").show() - } - - ignore("tpch part - special characters in path") { - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(part) - frame.createOrReplaceTempView("part") - - spark.sql("select * from part limit 100").show() - } - - ignore("tpch lineitem - read partition values") { - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(orders) - frame.createOrReplaceTempView("orders") - - spark.sql("select o_orderdate from orders limit 100").show() - } - - ignore("tpch lineitem - asterisk select") { - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(lineitem) - frame.createOrReplaceTempView("lineitem") - - spark.sql("select * from lineitem limit 10").show() - } - - ignore("tpch query 6") { - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(lineitem) - frame.createOrReplaceTempView("lineitem") - - spark.sql("select\n\tsum(l_extendedprice * l_discount) as revenue\n" + - "from\n\tlineitem\n" + - "where\n\tl_shipdate >= date '1994-01-01'\n\t" + - "and l_shipdate < date '1994-01-01' + interval '1' year\n\t" + - "and l_discount between .06 - 0.01 and .06 + 0.01\n\t" + - "and l_quantity < 24").show() - } - - ignore("tpch query 6 - performance comparision") { - val iterations = 10 - withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "false") { - val frame1 = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(lineitem) - frame1.createOrReplaceTempView("lineitem_arrow") - - val frame2 = spark.read - .parquet(lineitem) - frame2.createOrReplaceTempView("lineitem_parquet") - - val pPrev = System.currentTimeMillis() - (0 until iterations).foreach(_ => - spark.sql("select\n\tsum(l_extendedprice * l_discount) as 
revenue\n" + - "from\n\tlineitem_parquet\n" + - "where\n\tl_shipdate >= date '1994-01-01'\n\t" + - "and l_shipdate < date '1994-01-01' + interval '1' year\n\t" + - "and l_discount between .06 - 0.01 and .06 + 0.01\n\t" + - "and l_quantity < 24").show() - ) - val parquetExecTime = System.currentTimeMillis() - pPrev - - val aPrev = System.currentTimeMillis() - (0 until iterations).foreach(_ => { - // scalastyle:off println - println(SparkMemoryUtils.contextAllocator().getAllocatedMemory) - // scalastyle:on println - spark.sql("select\n\tsum(l_extendedprice * l_discount) as revenue\n" + - "from\n\tlineitem_arrow\n" + - "where\n\tl_shipdate >= date '1994-01-01'\n\t" + - "and l_shipdate < date '1994-01-01' + interval '1' year\n\t" + - "and l_discount between .06 - 0.01 and .06 + 0.01\n\t" + - "and l_quantity < 24").show() - } - ) - val arrowExecTime = System.currentTimeMillis() - aPrev - - // unstable assert - assert(arrowExecTime < parquetExecTime) - } - } - - ignore("tpch query 16 - performance comparision") { - val iterations = 1 - withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "false") { - val frame1 = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(partSupp) - frame1.createOrReplaceTempView("partsupp_arrow") - - val frame2 = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(part) - frame2.createOrReplaceTempView("part_arrow") - - val frame3 = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(supplier) - frame3.createOrReplaceTempView("supplier_arrow") - - val frame4 = spark.read - .parquet(partSupp) - frame4.createOrReplaceTempView("partsupp_parquet") - - val frame5 = spark.read - .parquet(part) - frame5.createOrReplaceTempView("part_parquet") - - val frame6 = spark.read - .parquet(supplier) - frame6.createOrReplaceTempView("supplier_parquet") - - val pPrev = System.currentTimeMillis() - (0 until iterations).foreach(_ => - spark.sql("select\n\tp_brand,\n\tp_type,\n\tp_size," + - "\n\tcount(distinct ps_suppkey) as supplier_cnt\n" + - "from\n\tpartsupp_parquet,\n\tpart_parquet\nwhere\n\tp_partkey" + - " = ps_partkey\n\tand p_brand <> 'Brand#45'\n\t" + - "and p_type not like 'MEDIUM POLISHED%'\n\tand p_size in " + - "(49, 14, 23, 45, 19, 3, 36, 9)\n\t" + - "and ps_suppkey not in (\n\t\tselect\n\t\t\ts_suppkey\n\t\t" + - "from\n\t\t\tsupplier_parquet\n\t\twhere\n\t\t\t" + - "s_comment like '%Customer%Complaints%'\n\t)\ngroup by\n\t" + - "p_brand,\n\tp_type,\n\tp_size\norder by\n\t" + - "supplier_cnt desc,\n\tp_brand,\n\tp_type,\n\tp_size").show() - ) - val parquetExecTime = System.currentTimeMillis() - pPrev - - val aPrev = System.currentTimeMillis() - (0 until iterations).foreach(_ => - spark.sql("select\n\tp_brand,\n\tp_type,\n\tp_size," + - "\n\tcount(distinct ps_suppkey) as supplier_cnt\n" + - "from\n\tpartsupp_arrow,\n\tpart_arrow\nwhere\n\tp_partkey" + - " = ps_partkey\n\tand p_brand <> 'Brand#45'\n\t" + - "and p_type not like 'MEDIUM POLISHED%'\n\tand p_size in " + - "(49, 14, 23, 45, 19, 3, 36, 9)\n\t" + - "and ps_suppkey not in (\n\t\tselect\n\t\t\ts_suppkey\n\t\t" + - "from\n\t\t\tsupplier_arrow\n\t\twhere\n\t\t\t" + - "s_comment like '%Customer%Complaints%'\n\t)\ngroup by\n\t" + - "p_brand,\n\tp_type,\n\tp_size\norder by\n\t" + - "supplier_cnt desc,\n\tp_brand,\n\tp_type,\n\tp_size").show() - ) - val arrowExecTime = System.currentTimeMillis() - aPrev - - // 
scalastyle:off println - println(arrowExecTime) - println(parquetExecTime) - // scalastyle:on println - // unstable assert - assert(arrowExecTime < parquetExecTime) - } - } - - ignore("tpch query 1") { - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(lineitem) - frame.createOrReplaceTempView("lineitem") - - spark.sql("select\n\tl_returnflag,\n\tl_linestatus," + - "\n\tsum(l_quantity) as sum_qty,\n\t" + - "sum(l_extendedprice) as sum_base_price," + - "\n\tsum(l_extendedprice * (1 - l_discount)) as sum_disc_price,\n\t" + - "sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge," + - "\n\tavg(l_quantity) as avg_qty,\n\t" + - "avg(l_extendedprice) as avg_price,\n\tavg(l_discount) as avg_disc," + - "\n\tcount(*) as count_order\nfrom\n\t" + - "lineitem\nwhere\n\tl_shipdate <= date '1998-12-01' - interval '90' day" + - "\ngroup by\n\tl_returnflag,\n\t" + - "l_linestatus\norder by\n\tl_returnflag,\n\tl_linestatus").explain(true) - } - - ignore("tpch query 21 - memory leak") { - val frame1 = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(supplier) - frame1.createOrReplaceTempView("supplier") - val frame2 = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(lineitem) - frame2.createOrReplaceTempView("lineitem") - val frame3 = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(orders) - frame3.createOrReplaceTempView("orders") - val frame4 = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") - .arrow(nation) - frame4.createOrReplaceTempView("nation") - - Executors.newSingleThreadExecutor().execute(() => { - spark.sql("select\n\ts_name,\n\tcount(*) as numwait\nfrom\n\tsupplier,\n\t" + - "lineitem l1,\n\torders,\n\tnation\nwhere\n\ts_suppkey = l1.l_suppkey\n\t" + - "and o_orderkey = l1.l_orderkey\n\tand o_orderstatus = 'F'\n\tand " + - "l1.l_receiptdate > l1.l_commitdate\n\tand exists (\n\t\tselect\n\t\t\t*\n\t\tfrom\n\t\t\t" + - "lineitem l2\n\t\twhere\n\t\t\tl2.l_orderkey = l1.l_orderkey\n\t\t\tand " + - "l2.l_suppkey <> l1.l_suppkey\n\t)\n\tand not exists (\n\t\tselect\n\t\t\t*\n\t\t" + - "from\n\t\t\tlineitem l3\n\t\twhere\n\t\t\tl3.l_orderkey = l1.l_orderkey\n\t\t\t" + - "and l3.l_suppkey <> l1.l_suppkey\n\t\t\tand l3.l_receiptdate > " + - "l3.l_commitdate\n\t)\n\tand s_nationkey = n_nationkey\n\tand n_name = 'SAUDI ARABIA'\n" + - "group by\n\ts_name\norder by\n\tnumwait desc,\n\t" + - "s_name\nlimit 100").show() - }) - Executors.newSingleThreadScheduledExecutor().scheduleWithFixedDelay(() => { - println("[org.apache.spark.sql.util.ArrowUtils.rootAllocator] " + - "Allocated memory amount: " + SparkMemoryUtils.contextAllocator()) - println("[com.intel.oap.vectorized.ArrowWritableColumnVector.allocator] " + - "Allocated memory amount: " + SparkMemoryUtils.contextAllocator().getAllocatedMemory) - }, 0L, 100L, TimeUnit.MILLISECONDS) - Thread.sleep(60 * 60 * 1000L) - } - -} diff --git a/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTest.scala b/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTest.scala deleted file mode 100644 index 5ad7596b901d..000000000000 --- 
a/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTest.scala +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.intel.oap.spark.sql.execution.datasources.arrow - -import java.io.File -import java.lang.management.ManagementFactory - -import com.intel.oap.spark.sql.ArrowWriteExtension -import com.intel.oap.spark.sql.DataFrameReaderImplicits._ -import com.intel.oap.spark.sql.DataFrameWriterImplicits._ -import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowOptions -import com.sun.management.UnixOperatingSystemMXBean -import org.apache.commons.io.FileUtils - -import org.apache.spark.SparkConf -import org.apache.spark.sql.SaveMode -import org.apache.spark.sql.{DataFrame, QueryTest, Row} -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils -import org.apache.spark.sql.functions.col -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.StaticSQLConf.SPARK_SESSION_EXTENSIONS -import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} - -class ArrowDataSourceTest extends QueryTest with SharedSparkSession { - private val parquetFile1 = "parquet-1.parquet" - private val parquetFile2 = "parquet-2.parquet" - private val parquetFile3 = "parquet-3.parquet" - private val parquetFile4 = "parquet-4.parquet" - private val parquetFile5 = "parquet-5.parquet" - - override protected def sparkConf: SparkConf = { - val conf = super.sparkConf - conf.set("spark.memory.offHeap.size", String.valueOf(10 * 1024 * 1024)) - conf.set("spark.unsafe.exceptionOnMemoryLeak", "false") - conf.set(SPARK_SESSION_EXTENSIONS.key, classOf[ArrowWriteExtension].getCanonicalName) - conf - } - - override def beforeAll(): Unit = { - super.beforeAll() - import testImplicits._ - spark.read - .json(Seq("{\"col\": -1}", "{\"col\": 0}", "{\"col\": 1}", "{\"col\": 2}", "{\"col\": null}") - .toDS()) - .repartition(1) - .write - .mode("overwrite") - .parquet(ArrowDataSourceTest.locateResourcePath(parquetFile1)) - - spark.read - .json(Seq("{\"col\": \"a\"}", "{\"col\": \"b\"}") - .toDS()) - .repartition(1) - .write - .mode("overwrite") - .parquet(ArrowDataSourceTest.locateResourcePath(parquetFile2)) - - spark.read - .json(Seq("{\"col1\": \"apple\", \"col2\": 100}", "{\"col1\": \"pear\", \"col2\": 200}", - "{\"col1\": \"apple\", \"col2\": 300}") - .toDS()) - .repartition(1) - .write - .mode("overwrite") - .parquet(ArrowDataSourceTest.locateResourcePath(parquetFile3)) - - spark.range(1000) - .select(col("id"), col("id").as("k")) - .write - .partitionBy("k") - .format("parquet") - .mode("overwrite") - 
.parquet(ArrowDataSourceTest.locateResourcePath(parquetFile4)) - - spark.range(100) - .select(col("id"), col("id").as("k")) - .write - .partitionBy("k") - .format("parquet") - .mode("overwrite") - .parquet(ArrowDataSourceTest.locateResourcePath(parquetFile5)) - - } - - override def afterAll(): Unit = { - delete(ArrowDataSourceTest.locateResourcePath(parquetFile1)) - delete(ArrowDataSourceTest.locateResourcePath(parquetFile2)) - delete(ArrowDataSourceTest.locateResourcePath(parquetFile3)) - delete(ArrowDataSourceTest.locateResourcePath(parquetFile4)) - delete(ArrowDataSourceTest.locateResourcePath(parquetFile5)) - super.afterAll() - } - - test("read parquet file") { - val path = ArrowDataSourceTest.locateResourcePath(parquetFile1) - verifyFrame( - spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .arrow(path), 5, 1) - } - - ignore("simple sql query on s3") { - val path = "s3a://mlp-spark-dataset-bucket/test_arrowds_s3_small" - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .arrow(path) - frame.createOrReplaceTempView("stab") - assert(spark.sql("select id from stab").count() === 1000) - } - - test("create catalog table") { - val path = ArrowDataSourceTest.locateResourcePath(parquetFile1) - spark.catalog.createTable("ptab", path, "arrow") - val sql = "select * from ptab" - spark.sql(sql).explain() - verifyFrame(spark.sql(sql), 5, 1) - } - - test("create table statement") { - spark.sql("drop table if exists ptab") - spark.sql("create table ptab (col1 varchar(14), col2 bigint, col3 bigint) " + - "using arrow " + - "partitioned by (col1)") - spark.sql("select * from ptab") - } - - test("simple SQL query on parquet file - 1") { - val path = ArrowDataSourceTest.locateResourcePath(parquetFile1) - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .arrow(path) - frame.createOrReplaceTempView("ptab") - verifyFrame(spark.sql("select * from ptab"), 5, 1) - verifyFrame(spark.sql("select col from ptab"), 5, 1) - verifyFrame(spark.sql("select col from ptab where col is not null or col is null"), - 5, 1) - } - - test("simple SQL query on parquet file - 2") { - val path = ArrowDataSourceTest.locateResourcePath(parquetFile3) - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .arrow(path) - frame.createOrReplaceTempView("ptab") - val sqlFrame = spark.sql("select * from ptab") - assert( - sqlFrame.schema === - StructType(Seq(StructField("col1", StringType), StructField("col2", LongType)))) - val rows = sqlFrame.collect() - assert(rows(0).get(0) == "apple") - assert(rows(0).get(1) == 100) - assert(rows(1).get(0) == "pear") - assert(rows(1).get(1) == 200) - assert(rows(2).get(0) == "apple") - assert(rows(2).get(1) == 300) - assert(rows.length === 3) - } - - test("simple parquet write") { - val path = ArrowDataSourceTest.locateResourcePath(parquetFile3) - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .arrow(path) - frame.createOrReplaceTempView("ptab") - val sqlFrame = spark.sql("select * from ptab") - - val writtenPath = FileUtils.getTempDirectory + File.separator + "written.parquet" - sqlFrame.write.mode(SaveMode.Overwrite) - .option(ArrowOptions.KEY_TARGET_FORMAT, "parquet") - .arrow(writtenPath) - - val frame2 = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .arrow(writtenPath) - frame2.createOrReplaceTempView("ptab2") - val sqlFrame2 = spark.sql("select * from ptab2") - - verifyFrame(sqlFrame2, 3, 2) - } - - test("simple SQL query on parquet 
file with pushed filters") { - val path = ArrowDataSourceTest.locateResourcePath(parquetFile1) - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .arrow(path) - frame.createOrReplaceTempView("ptab") - spark.sql("select col from ptab where col = 1").explain(true) - val result = spark.sql("select col from ptab where col = 1") // fixme rowcount == 2? - assert( - result.schema === - StructType(Seq(StructField("col", LongType)))) - assert(result.collect().length === 1) - } - - test("ignore unrecognizable types when pushing down filters") { - val path = ArrowDataSourceTest.locateResourcePath(parquetFile2) - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .arrow(path) - frame.createOrReplaceTempView("ptab") - val rows = spark.sql("select * from ptab where col = 'b'").collect() - assert(rows.length === 1) - } - - ignore("dynamic partition pruning") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", - SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false", - SQLConf.USE_V1_SOURCE_LIST.key -> "arrow", - SQLConf.CBO_ENABLED.key -> "true") { - - var path: String = null - path = ArrowDataSourceTest.locateResourcePath(parquetFile4) - spark.catalog.createTable("df1", path, "arrow") - path = ArrowDataSourceTest.locateResourcePath(parquetFile5) - spark.catalog.createTable("df2", path, "arrow") - - sql("ALTER TABLE df1 RECOVER PARTITIONS") - sql("ALTER TABLE df2 RECOVER PARTITIONS") - - sql("ANALYZE TABLE df1 COMPUTE STATISTICS FOR COLUMNS id") - sql("ANALYZE TABLE df2 COMPUTE STATISTICS FOR COLUMNS id") - - val df = sql("SELECT df1.id, df2.k FROM df1 " + - "JOIN df2 ON df1.k = df2.k AND df2.id < 2") - assert(df.queryExecution.executedPlan.toString().contains("dynamicpruningexpression")) - checkAnswer(df, Row(0, 0) :: Row(1, 1) :: Nil) - } - } - - test("count(*) without group by v2") { - val path = ArrowDataSourceTest.locateResourcePath(parquetFile1) - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .arrow(path) - frame.createOrReplaceTempView("ptab") - val df = sql("SELECT COUNT(*) FROM ptab") - checkAnswer(df, Row(5) :: Nil) - - } - - test("file descriptor leak") { - val path = ArrowDataSourceTest.locateResourcePath(parquetFile1) - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .arrow(path) - frame.createOrReplaceTempView("ptab") - - def getFdCount: Long = { - ManagementFactory.getOperatingSystemMXBean - .asInstanceOf[UnixOperatingSystemMXBean] - .getOpenFileDescriptorCount - } - - val initialFdCount = getFdCount - for (_ <- 0 until 100) { - verifyFrame(spark.sql("select * from ptab"), 5, 1) - } - val fdGrowth = getFdCount - initialFdCount - assert(fdGrowth < 100) - } - - test("file descriptor leak - v1") { - val path = ArrowDataSourceTest.locateResourcePath(parquetFile1) - val frame = spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .arrow(path) - frame.createOrReplaceTempView("ptab2") - - def getFdCount: Long = { - ManagementFactory.getOperatingSystemMXBean - .asInstanceOf[UnixOperatingSystemMXBean] - .getOpenFileDescriptorCount - } - - val initialFdCount = getFdCount - for (_ <- 0 until 100) { - verifyFrame(spark.sql("select * from ptab2"), 5, 1) - } - val fdGrowth = getFdCount - initialFdCount - assert(fdGrowth < 100) - } - - private val csvFile1 = "people.csv" - private val csvFile2 = "example.csv" - private val csvFile3 = "example-tab.csv" - - ignore("read csv file 
without specifying original format") { - // not implemented - verifyFrame(spark.read.format("arrow") - .load(ArrowDataSourceTest.locateResourcePath(csvFile1)), 1, 2) - } - - test("read csv file") { - val path = ArrowDataSourceTest.locateResourcePath(csvFile1) - verifyFrame( - spark.read - .format("arrow") - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "csv") - .load(path), 2, 3) - } - - test("read csv file 2") { - val path = ArrowDataSourceTest.locateResourcePath(csvFile2) - verifyFrame( - spark.read - .format("arrow") - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "csv") - .load(path), 34, 9) - } - - test("read csv file 3 - tab separated") { - val path = ArrowDataSourceTest.locateResourcePath(csvFile3) - verifyFrame( - spark.read - .format("arrow") - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "csv") - .option("delimiter", "\t") - .load(path), 34, 9) - } - - test("read csv file - programmatic API ") { - val path = ArrowDataSourceTest.locateResourcePath(csvFile1) - verifyFrame( - spark.read - .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "csv") - .arrow(path), 2, 3) - } - - def verifyFrame(frame: DataFrame, rowCount: Int, columnCount: Int): Unit = { - assert(frame.schema.length === columnCount) - assert(frame.collect().length === rowCount) - } - - def verifyCsv(frame: DataFrame): Unit = { - // todo assert something - } - - def verifyParquet(frame: DataFrame): Unit = { - verifyFrame(frame, 5, 1) - } - - def delete(path: String): Unit = { - FileUtils.forceDelete(new File(path)) - } - - def closeAllocators(): Unit = { - SparkMemoryUtils.contextAllocator().close() - } -} - -object ArrowDataSourceTest { - private def locateResourcePath(resource: String): String = { - classOf[ArrowDataSourceTest].getClassLoader.getResource("") - .getPath.concat(File.separator).concat(resource) - } -} diff --git a/native-sql-engine/core/pom.xml b/native-sql-engine/core/pom.xml index 1786ff874699..c059b6e5cb16 100644 --- a/native-sql-engine/core/pom.xml +++ b/native-sql-engine/core/pom.xml @@ -140,16 +140,28 @@ - spark-arrow-datasource-common - com.intel.oap - ${project.version} - provided - - - spark-arrow-datasource-standard - com.intel.oap - ${project.version} - provided + org.apache.arrow + arrow-dataset + ${arrow.version} + + + io.netty + netty-common + + + io.netty + netty-buffer + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-annotations + + + compile org.scalacheck diff --git a/arrow-data-source/common/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/NativeSQLMemoryConsumer.java b/native-sql-engine/core/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/NativeSQLMemoryConsumer.java similarity index 100% rename from arrow-data-source/common/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/NativeSQLMemoryConsumer.java rename to native-sql-engine/core/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/NativeSQLMemoryConsumer.java diff --git a/arrow-data-source/common/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/NativeSQLMemoryMetrics.java b/native-sql-engine/core/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/NativeSQLMemoryMetrics.java similarity index 100% rename from arrow-data-source/common/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/NativeSQLMemoryMetrics.java rename to native-sql-engine/core/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/NativeSQLMemoryMetrics.java diff --git 
a/native-sql-engine/core/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedAllocationListener.java b/native-sql-engine/core/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedAllocationListener.java new file mode 100644 index 000000000000..14ed5a7da608 --- /dev/null +++ b/native-sql-engine/core/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedAllocationListener.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.oap.spark.sql.execution.datasources.v2.arrow; + +import org.apache.arrow.memory.AllocationListener; + +public class SparkManagedAllocationListener implements AllocationListener { + public static long BLOCK_SIZE = 8L * 1024 * 1024; // 8MB per block + + private final NativeSQLMemoryConsumer consumer; + private final NativeSQLMemoryMetrics metrics; + + private long bytesReserved = 0L; + private long blocksReserved = 0L; + + public SparkManagedAllocationListener(NativeSQLMemoryConsumer consumer, NativeSQLMemoryMetrics metrics) { + this.consumer = consumer; + this.metrics = metrics; + } + + @Override + public void onPreAllocation(long size) { + long requiredBlocks = updateReservation(size); + if (requiredBlocks < 0) { + throw new IllegalStateException(); + } + if (requiredBlocks == 0) { + return; + } + long toBeAcquired = requiredBlocks * BLOCK_SIZE; + consumer.acquire(toBeAcquired); + metrics.inc(toBeAcquired); + } + + @Override + public void onRelease(long size) { + long requiredBlocks = updateReservation(-size); + if (requiredBlocks > 0) { + throw new IllegalStateException(); + } + if (requiredBlocks == 0) { + return; + } + long toBeReleased = -requiredBlocks * BLOCK_SIZE; + consumer.free(toBeReleased); + metrics.inc(-toBeReleased); + } + + public long updateReservation(long bytesToAdd) { + synchronized (this) { + long newBytesReserved = bytesReserved + bytesToAdd; + final long newBlocksReserved; + // ceiling + if (newBytesReserved == 0L) { + // 0 is the special case in ceiling algorithm + newBlocksReserved = 0L; + } else { + newBlocksReserved = (newBytesReserved - 1L) / BLOCK_SIZE + 1L; + } + long requiredBlocks = newBlocksReserved - blocksReserved; + bytesReserved = newBytesReserved; + blocksReserved = newBlocksReserved; + return requiredBlocks; + } + } +} diff --git a/arrow-data-source/common/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedReservationListener.java b/native-sql-engine/core/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedReservationListener.java similarity index 100% rename from arrow-data-source/common/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedReservationListener.java rename to 
native-sql-engine/core/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/SparkManagedReservationListener.java diff --git a/arrow-data-source/common/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/Spiller.java b/native-sql-engine/core/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/Spiller.java similarity index 100% rename from arrow-data-source/common/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/Spiller.java rename to native-sql-engine/core/src/main/java/com/intel/oap/spark/sql/execution/datasources/v2/arrow/Spiller.java diff --git a/arrow-data-source/common/src/main/java/com/intel/oap/vectorized/ArrowWritableColumnVector.java b/native-sql-engine/core/src/main/java/com/intel/oap/vectorized/ArrowWritableColumnVector.java similarity index 100% rename from arrow-data-source/common/src/main/java/com/intel/oap/vectorized/ArrowWritableColumnVector.java rename to native-sql-engine/core/src/main/java/com/intel/oap/vectorized/ArrowWritableColumnVector.java diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ArrowColumnarToRowExec.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ArrowColumnarToRowExec.scala index b13f623fac9d..cb3a120e2ae5 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ArrowColumnarToRowExec.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ArrowColumnarToRowExec.scala @@ -20,7 +20,6 @@ package com.intel.oap.execution import com.intel.oap.expression.ConverterUtils import com.intel.oap.vectorized.{ArrowColumnarToRowJniWrapper, ArrowWritableColumnVector} import org.apache.arrow.vector.types.pojo.{Field, Schema} - import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow} diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/BasicPhysicalOperatorTransformer.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/BasicPhysicalOperatorTransformer.scala index cb4effdae041..908bedc8486c 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/BasicPhysicalOperatorTransformer.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/BasicPhysicalOperatorTransformer.scala @@ -38,7 +38,6 @@ import com.google.common.collect.Lists import com.intel.oap.GazellePluginConfig import com.intel.oap.substrait.expression.ExpressionNode import com.intel.oap.substrait.rel.{RelBuilder, RelNode} -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils; case class ConditionProjectExecTransformer( condition: Expression, diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/BatchScanExecTransformer.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/BatchScanExecTransformer.scala index 8ed911473e20..ae5828fb8ab4 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/BatchScanExecTransformer.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/BatchScanExecTransformer.scala @@ -25,10 +25,9 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, Scan} import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.datasources.v2.BatchScanExec +import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, FileScan} import 
org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} -import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowScan import com.intel.oap.substrait.`type`.TypeBuiler import com.intel.oap.substrait.expression.{ExpressionBuilder, ExpressionNode} import org.apache.spark.sql.execution.datasources.FilePartition @@ -38,12 +37,8 @@ import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan class BatchScanExecTransformer(output: Seq[AttributeReference], @transient scan: Scan) extends BatchScanExec(output, scan) with TransformSupport { val tmpDir: String = GazellePluginConfig.getConf.tmpFile - val filterExprs: Seq[Expression] = if (scan.isInstanceOf[ParquetScan]) { - scan.asInstanceOf[ParquetScan].dataFilters - } else if (scan.isInstanceOf[OrcScan]) { - scan.asInstanceOf[OrcScan].dataFilters - } else if (scan.isInstanceOf[ArrowScan]) { - scan.asInstanceOf[ArrowScan].dataFilters + val filterExprs: Seq[Expression] = if (scan.isInstanceOf[FileScan]) { + scan.asInstanceOf[FileScan].dataFilters } else { throw new UnsupportedOperationException(s"${scan.getClass.toString} is not supported") } @@ -57,18 +52,7 @@ class BatchScanExecTransformer(output: Seq[AttributeReference], @transient scan: "inputSize" -> SQLMetrics.createSizeMetric(sparkContext, "input size in bytes")) override def doExecuteColumnar(): RDD[ColumnarBatch] = { - val numOutputRows = longMetric("numOutputRows") - val numInputBatches = longMetric("numInputBatches") - val numOutputBatches = longMetric("numOutputBatches") - val scanTime = longMetric("scanTime") - val inputSize = longMetric("inputSize") - val inputColumnarRDD = - new ColumnarDataSourceRDD(sparkContext, partitions, readerFactory, true, scanTime, numInputBatches, inputSize, tmpDir) - inputColumnarRDD.map { r => - numOutputRows += r.numRows() - numOutputBatches += 1 - r - } + throw new UnsupportedOperationException(s"This operator doesn't support doExecuteColumnar().") } override def canEqual(other: Any): Boolean = other.isInstanceOf[BatchScanExecTransformer] diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/BroadcastHashJoinExecTransformer.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/BroadcastHashJoinExecTransformer.scala index c1606f3f8ed4..b474453b411e 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/BroadcastHashJoinExecTransformer.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/BroadcastHashJoinExecTransformer.scala @@ -29,7 +29,6 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils import org.apache.spark.sql.execution.joins.{BaseJoinExec, HashJoin, ShuffledJoin} import org.apache.spark.sql.execution.joins.HashedRelationInfo import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/CoalesceBatchesExec.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/CoalesceBatchesExec.scala index 476c48e6b62b..b1ba0bdf3470 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/CoalesceBatchesExec.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/CoalesceBatchesExec.scala @@ -20,22 +20,20 @@ package com.intel.oap.execution import 
com.intel.oap.expression.ConverterUtils import com.intel.oap.vectorized.ArrowWritableColumnVector import com.intel.oap.vectorized.CloseableColumnBatchIterator - import org.apache.arrow.vector.util.VectorBatchAppender import org.apache.arrow.memory.{BufferAllocator, RootAllocator} -import org.apache.arrow.vector.types.pojo.Schema; - +import org.apache.arrow.vector.types.pojo.Schema import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.physical.Partitioning +import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils -import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} -import org.apache.spark.sql.types.{StructType, StructField} -import org.apache.spark.sql.util.ArrowUtils; +import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.util.ArrowUtils import scala.collection.mutable.ListBuffer import scala.collection.JavaConverters._ diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ColumnarDataSourceRDD.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ColumnarDataSourceRDD.scala deleted file mode 100644 index 0187e7577057..000000000000 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ColumnarDataSourceRDD.scala +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.intel.oap.execution - -import com.intel.oap.GazellePluginConfig -import com.intel.oap.vectorized._ -import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.util._ -import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} -import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile} -import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} -import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} -import org.apache.spark.sql.execution.datasources.v2.VectorizedFilePartitionReaderHandler -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils -import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetPartitionReaderFactory -import org.apache.spark.sql.util.OASPackageBridge._ - -class DataSourceRDDPartition(val index: Int, val inputPartition: InputPartition) - extends Partition - with Serializable - -// TODO: we should have 2 RDDs: an RDD[InternalRow] for row-based scan, an `RDD[ColumnarBatch]` for -// columnar scan. -class ColumnarDataSourceRDD( - sc: SparkContext, - @transient private val inputPartitions: Seq[InputPartition], - partitionReaderFactory: PartitionReaderFactory, - columnarReads: Boolean, - scanTime: SQLMetric, - numInputBatches: SQLMetric, - inputSize: SQLMetric, - tmp_dir: String) - extends RDD[ColumnarBatch](sc, Nil) { - val numaBindingInfo = GazellePluginConfig.getConf.numaBindingInfo - - override protected def getPartitions: Array[Partition] = { - inputPartitions.zipWithIndex.map { - case (inputPartition, index) => new DataSourceRDDPartition(index, inputPartition) - }.toArray - } - - private def castPartition(split: Partition): DataSourceRDDPartition = split match { - case p: DataSourceRDDPartition => p - case _ => throw new SparkException(s"[BUG] Not a DataSourceRDDPartition: $split") - } - - override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = { - ExecutorManager.tryTaskSet(numaBindingInfo) - val inputPartition = castPartition(split).inputPartition - inputPartition match { - case p: FilePartition => - p.files.foreach { f => inputSize += f.length } - case _ => - } - val reader = if (columnarReads) { - partitionReaderFactory match { - case factory: ParquetPartitionReaderFactory => - VectorizedFilePartitionReaderHandler.get(inputPartition, factory, tmp_dir) - case _ => partitionReaderFactory.createColumnarReader(inputPartition) - } - } else { - partitionReaderFactory.createReader(inputPartition) - } - - val rddId = this - SparkMemoryUtils.addLeakSafeTaskCompletionListener[Unit](_ => reader.close()) - val iter = new Iterator[Any] { - private val inputMetrics = TaskContext.get().taskMetrics().inputMetrics - - private[this] var valuePrepared = false - - override def hasNext: Boolean = { - if (!valuePrepared) { - try { - val beforeScan = System.nanoTime() - valuePrepared = reader.next() - numInputBatches += 1 - scanTime += (System.nanoTime() - beforeScan) / (1000 * 1000) - } catch { - case e: Throwable => - val errmsg = e.getStackTrace.mkString("\n") - logError(s"hasNext got exception: $errmsg") - valuePrepared = false - } - } - valuePrepared - } - - override def next(): Any = { - if (!hasNext) { - throw new java.util.NoSuchElementException("End of stream") - } - valuePrepared = false - val value = reader.get() - val bytes: Long = value match { - case batch: ColumnarBatch => - (0 until batch.numCols()).map { i => - val vector = Option(batch.column(i)) - vector.map { - case av: 
ArrowWritableColumnVector => - av.getValueVector.getBufferSize.toLong - case _ => 0L - }.sum - }.sum - case _ => 0L - } - inputMetrics.bridgeIncBytesRead(bytes) - value - } - } - val closeableColumnarBatchIterator = new CloseableColumnBatchIterator( - iter.asInstanceOf[Iterator[ColumnarBatch]]) - // TODO: SPARK-25083 remove the type erasure hack in data source scan - new InterruptibleIterator(context, closeableColumnarBatchIterator) - } - - override def getPreferredLocations(split: Partition): Seq[String] = { - castPartition(split).inputPartition.preferredLocations() - } - -} diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ExpandExecTransformer.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ExpandExecTransformer.scala index e6c7657b06cc..622af76dfe9f 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ExpandExecTransformer.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ExpandExecTransformer.scala @@ -31,7 +31,6 @@ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartit import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.TaskContext -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils import org.apache.spark.sql.types.DecimalType case class ExpandExecTransformer( diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/HashAggregateExecTransformer.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/HashAggregateExecTransformer.scala index a80c1a7d3a54..a322c97b0064 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/HashAggregateExecTransformer.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/HashAggregateExecTransformer.scala @@ -48,7 +48,6 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.sql.catalyst.util.truncatedString import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.aggregate._ -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.vectorized.MutableColumnarRow import org.apache.spark.sql.internal.SQLConf diff --git a/arrow-data-source/common/src/main/scala/com/intel/oap/sql/execution/RowToArrowColumnarExec.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/RowToArrowColumnarExec.scala similarity index 99% rename from arrow-data-source/common/src/main/scala/com/intel/oap/sql/execution/RowToArrowColumnarExec.scala rename to native-sql-engine/core/src/main/scala/com/intel/oap/execution/RowToArrowColumnarExec.scala index b9b58fcb9c90..9bd55f729866 100644 --- a/arrow-data-source/common/src/main/scala/com/intel/oap/sql/execution/RowToArrowColumnarExec.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/RowToArrowColumnarExec.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package com.intel.oap.sql.execution +package com.intel.oap.execution import java.util.concurrent.TimeUnit._ diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ShuffledHashJoinExecTransformer.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ShuffledHashJoinExecTransformer.scala index 5f3ee46e83c9..af4c41f4a8f8 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ShuffledHashJoinExecTransformer.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/ShuffledHashJoinExecTransformer.scala @@ -49,7 +49,6 @@ import org.apache.arrow.memory.ArrowBuf import com.google.common.collect.Lists import com.intel.oap.expression._ import com.intel.oap.vectorized.ExpressionEvaluator -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} import org.apache.spark.sql.execution.joins.{HashJoin,ShuffledJoin,BaseJoinExec} diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/SortExecTransformer.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/SortExecTransformer.scala index 9b2fb8cd5882..2932fd0a6cdf 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/SortExecTransformer.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/SortExecTransformer.scala @@ -39,7 +39,6 @@ import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReference import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.rdd.RDD -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils import org.apache.spark.util.{ExecutorManager, UserAddedJarUtils, Utils} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/SortMergeJoinExecTransformer.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/SortMergeJoinExecTransformer.scala index d04d81f1e0e3..9aa5e35130f6 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/SortMergeJoinExecTransformer.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/SortMergeJoinExecTransformer.scala @@ -50,7 +50,6 @@ import org.apache.arrow.memory.ArrowBuf import com.google.common.collect.Lists import com.intel.oap.expression._ import com.intel.oap.vectorized.ExpressionEvaluator -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} import org.apache.spark.sql.types.DecimalType diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/WholestageColumnarRDD.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/WholestageColumnarRDD.scala index 0af1de815677..461d35edbd13 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/WholestageColumnarRDD.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/WholestageColumnarRDD.scala @@ -32,7 +32,6 @@ import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFacto import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.FilePartition import 
org.apache.spark.sql.execution.datasources.v2.VectorizedFilePartitionReaderHandler -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetPartitionReaderFactory import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.util.ArrowUtils diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/WindowExecTransformer.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/WindowExecTransformer.scala index 3430a2480349..f9e3028a573b 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/execution/WindowExecTransformer.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/execution/WindowExecTransformer.scala @@ -36,7 +36,6 @@ import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistrib import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.{SortExec, SparkPlan} -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.window.WindowExecBase import org.apache.spark.sql.internal.SQLConf @@ -49,7 +48,6 @@ import scala.collection.immutable.Stream.Empty import scala.collection.mutable.ListBuffer import scala.util.Random -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkSchemaUtils import util.control.Breaks._ case class WindowExecTransformer(windowExpression: Seq[NamedExpression], diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/expression/CodeGeneration.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/expression/CodeGeneration.scala index d68a40e3ccf9..bb5882b3fbc0 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/expression/CodeGeneration.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/expression/CodeGeneration.scala @@ -21,7 +21,6 @@ import org.apache.arrow.vector.Float4Vector import org.apache.arrow.vector.IntVector import org.apache.arrow.vector.types.{DateUnit, FloatingPointPrecision, TimeUnit} import org.apache.arrow.vector.types.pojo.ArrowType - import org.apache.spark.sql.execution.datasources.v2.arrow.SparkSchemaUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/expression/ConverterUtils.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/expression/ConverterUtils.scala index eaa7606ba9a6..fca4342fa30e 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/expression/ConverterUtils.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/expression/ConverterUtils.scala @@ -62,8 +62,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID import org.apache.arrow.vector.types.{DateUnit, FloatingPointPrecision} import org.apache.spark.sql.catalyst.util.DateTimeConstants import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_SECOND -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkSchemaUtils -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkVectorUtils +import org.apache.spark.sql.execution.datasources.v2.arrow.{SparkSchemaUtils, SparkVectorUtils} object ConverterUtils extends Logging { def calcuateEstimatedSize(columnarBatch: ColumnarBatch): Long = { diff --git 
a/native-sql-engine/core/src/main/scala/com/intel/oap/expression/UnaryOperatorTransformer.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/expression/UnaryOperatorTransformer.scala index f092d801ad52..3411966013c0 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/expression/UnaryOperatorTransformer.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/expression/UnaryOperatorTransformer.scala @@ -36,7 +36,6 @@ import com.intel.oap.substrait.`type`.TypeBuiler import com.intel.oap.substrait.expression.{ExpressionBuilder, ExpressionNode} import org.apache.arrow.vector.types.TimeUnit import org.apache.spark.sql.catalyst.util.DateTimeConstants -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkSchemaUtils /** * A version of add that supports columnar processing for longs. diff --git a/native-sql-engine/core/src/main/scala/com/intel/oap/extension/ColumnarOverrides.scala b/native-sql-engine/core/src/main/scala/com/intel/oap/extension/ColumnarOverrides.scala index 557c987906c5..41661762e3b1 100644 --- a/native-sql-engine/core/src/main/scala/com/intel/oap/extension/ColumnarOverrides.scala +++ b/native-sql-engine/core/src/main/scala/com/intel/oap/extension/ColumnarOverrides.scala @@ -19,7 +19,6 @@ package com.intel.oap import com.intel.oap.execution._ import com.intel.oap.extension.columnar.{RowGuard, TransformGuardRule} -import com.intel.oap.sql.execution.RowToArrowColumnarExec import org.apache.spark.internal.config._ import org.apache.spark.internal.Logging import org.apache.spark.sql.{SparkSession, SparkSessionExtensions} diff --git a/arrow-data-source/common/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkMemoryUtils.scala b/native-sql-engine/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkMemoryUtils.scala similarity index 98% rename from arrow-data-source/common/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkMemoryUtils.scala rename to native-sql-engine/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkMemoryUtils.scala index a5067098eec1..a6dd4dc707e2 100644 --- a/arrow-data-source/common/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkMemoryUtils.scala +++ b/native-sql-engine/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkMemoryUtils.scala @@ -15,23 +15,23 @@ * limitations under the License. 
*/ + package org.apache.spark.sql.execution.datasources.v2.arrow import java.io.PrintWriter import java.util import java.util.UUID -import scala.collection.JavaConverters._ - -import com.intel.oap.spark.sql.execution.datasources.v2.arrow._ +import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{NativeSQLMemoryConsumer, NativeSQLMemoryMetrics, SparkManagedAllocationListener, SparkManagedReservationListener, Spiller} import com.sun.xml.internal.messaging.saaj.util.ByteOutputStream + +import scala.collection.JavaConverters._ import org.apache.arrow.dataset.jni.NativeMemoryPool import org.apache.arrow.memory.AllocationListener import org.apache.arrow.memory.BufferAllocator import org.apache.arrow.memory.MemoryChunkCleaner import org.apache.arrow.memory.MemoryChunkManager import org.apache.arrow.memory.RootAllocator - import org.apache.spark.SparkEnv import org.apache.spark.TaskContext import org.apache.spark.internal.Logging diff --git a/arrow-data-source/common/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkSchemaUtils.scala b/native-sql-engine/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkSchemaUtils.scala similarity index 100% rename from arrow-data-source/common/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkSchemaUtils.scala rename to native-sql-engine/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkSchemaUtils.scala diff --git a/arrow-data-source/common/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkVectorUtils.scala b/native-sql-engine/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkVectorUtils.scala similarity index 100% rename from arrow-data-source/common/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkVectorUtils.scala rename to native-sql-engine/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/arrow/SparkVectorUtils.scala diff --git a/native-sql-engine/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExecTransformer.scala b/native-sql-engine/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExecTransformer.scala index 19e3723ea0c9..d9bc26df0179 100644 --- a/native-sql-engine/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExecTransformer.scala +++ b/native-sql-engine/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExecTransformer.scala @@ -36,7 +36,6 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.python.EvalPythonExec -import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnVector, ColumnarBatch} import org.apache.spark.sql.types.{DataType, StructField, StructType} diff --git a/pom.xml b/pom.xml index 870a6e5f6bbb..d42eb1f6fe27 100644 --- a/pom.xml +++ b/pom.xml @@ -30,7 +30,6 @@ - arrow-data-source native-sql-engine/core shims From 95834121a3798bbcd60941de8cf141383e5a1166 Mon Sep 17 00:00:00 2001 From: Rui Mo Date: Wed, 15 Dec 2021 19:47:47 +0800 Subject: [PATCH 2/2] remove the dependency on log4j --- pom.xml | 8 ++++++++ shims/pom.xml | 22 +++++----------------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/pom.xml b/pom.xml index d42eb1f6fe27..95692576b804 100644 --- 
a/pom.xml +++ b/pom.xml @@ -195,6 +195,14 @@ org.apache.curator curator-recipes + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + diff --git a/shims/pom.xml b/shims/pom.xml index fd185052c3f1..aa1ff1029abe 100644 --- a/shims/pom.xml +++ b/shims/pom.xml @@ -44,23 +44,11 @@ - org.slf4j - slf4j-log4j12 - 1.7.30 - test - - - log4j - log4j - 1.2.17 - test - - - org.apache.hadoop - hadoop-client - ${hadoop.version} - test - + org.apache.hadoop + hadoop-client + ${hadoop.version} + test +
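
The SparkManagedAllocationListener carried over into native-sql-engine in PATCH 1 reserves Spark off-heap memory in whole 8 MB blocks rather than per allocation. The following is a minimal standalone sketch of that ceiling arithmetic only; the object and method names (BlockReservationSketch, blocksFor) are illustrative and are not part of the patch.

    object BlockReservationSketch {
      // Mirrors SparkManagedAllocationListener.BLOCK_SIZE: memory is acquired in 8 MB blocks.
      val BlockSize: Long = 8L * 1024 * 1024

      // Same ceiling rule as updateReservation(): zero bytes maps to zero blocks,
      // anything else rounds up to the next whole block.
      def blocksFor(bytes: Long): Long =
        if (bytes == 0L) 0L else (bytes - 1L) / BlockSize + 1L

      def main(args: Array[String]): Unit = {
        println(blocksFor(1L))                       // 1 -- even a single byte pins a full block
        println(blocksFor(8L * 1024 * 1024))         // 1 -- exactly one block
        println(blocksFor(8L * 1024 * 1024 + 1L))    // 2 -- crossing the boundary acquires another block
      }
    }

After PATCH 2, log4j should no longer reach the shims module's test classpath through explicit dependencies; running mvn dependency:tree -pl shims is one way to confirm that the new exclusions take effect.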