support gpu transform (#19)

wbo4958 · Jun 26, 2024 · f1df971 · f1df971
1 parent 588e577
commit f1df971
Show file tree

Hide file tree

Showing 16 changed files with 316 additions and 127 deletions.
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/GpuColumnBatch.java b/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/GpuColumnBatch.java
@@ -43,15 +43,15 @@ public void close() {
     }
   }
 
-  public Table slice(int index) {
+  public Table select(int index) {
     if (index < 0) {
       return null;
     }
-    return slice(Arrays.asList(index));
+    return select(Arrays.asList(index));
   }
 
   /** Select the columns indicated by indices into a Table */
-  public Table slice(List<Integer> indices) {
+  public Table select(List<Integer> indices) {
     if (indices == null || indices.size() == 0) {
       return null;
     }

diff --git a/...s/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala b/...s/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala
@@ -17,16 +17,22 @@
 package ml.dmlc.xgboost4j.scala.spark
 
 import scala.collection.mutable.ArrayBuffer
-import scala.jdk.CollectionConverters.seqAsJavaListConverter
+import scala.jdk.CollectionConverters.{asScalaIteratorConverter, seqAsJavaListConverter}
 
 import ai.rapids.cudf.Table
-import com.nvidia.spark.rapids.ColumnarRdd
+import com.nvidia.spark.rapids.{ColumnarRdd, GpuColumnVectorUtils}
+import org.apache.commons.logging.LogFactory
+import org.apache.spark.TaskContext
+import org.apache.spark.ml.functions.array_to_vector
 import org.apache.spark.ml.param.Param
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Column, Dataset}
+import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
+import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
+import org.apache.spark.sql.vectorized.ColumnarBatch
 
 import ml.dmlc.xgboost4j.java.{CudfColumnBatch, GpuColumnBatch}
-import ml.dmlc.xgboost4j.scala.QuantileDMatrix
+import ml.dmlc.xgboost4j.scala.{DMatrix, QuantileDMatrix}
 import ml.dmlc.xgboost4j.scala.spark.params.HasGroupCol
 
 /**
@@ -35,6 +41,8 @@ import ml.dmlc.xgboost4j.scala.spark.params.HasGroupCol
  */
 class GpuXGBoostPlugin extends XGBoostPlugin {
 
+  private val logger = LogFactory.getLog("XGBoostSparkGpuPlugin")
+
   /**
    * Whether the plugin is enabled or not, if not enabled, fallback
    * to the regular CPU pipeline
@@ -115,10 +123,10 @@ class GpuXGBoostPlugin extends XGBoostPlugin {
       val colBatchIter = iter.map { table =>
         withResource(new GpuColumnBatch(table, null)) { batch =>
           new CudfColumnBatch(
-            batch.slice(indices.featureIds.get.map(Integer.valueOf).asJava),
-            batch.slice(indices.labelId),
-            batch.slice(indices.weightId.getOrElse(-1)),
-            batch.slice(indices.marginId.getOrElse(-1)));
+            batch.select(indices.featureIds.get.map(Integer.valueOf).asJava),
+            batch.select(indices.labelId),
+            batch.select(indices.weightId.getOrElse(-1)),
+            batch.select(indices.marginId.getOrElse(-1)));
         }
       }
       new QuantileDMatrix(colBatchIter, missing, maxBin, nthread)
@@ -150,4 +158,124 @@ class GpuXGBoostPlugin extends XGBoostPlugin {
     }
   }
 
+
+  /**
+   * Run prediction for `dataset` on the GPU and return the result as a DataFrame.
+   *
+   * Each partition is consumed as a stream of cudf tables. For every table the feature
+   * columns are selected and wrapped into a DMatrix, the whole table is copied into a
+   * host-side ColumnarBatch, and `model.predictInternal` is called with the DMatrix and
+   * the rows of that batch. The resulting rows are assembled into a DataFrame with the
+   * schema returned by `model.preprocess` and finally passed to `model.postTransform`.
+   *
+   * @param model   the fitted model supplying the booster, feature columns and parameters
+   * @param dataset the input dataset; it is cast to a DataFrame for the columnar read
+   * @return the DataFrame produced by `model.postTransform`
+   * @throws RuntimeException if no feature columns can be selected from a batch
+   */
+  override def transform[M <: XGBoostModel[M]](model: XGBoostModel[M],
+                                               dataset: Dataset[_]): DataFrame = {
+    val sc = dataset.sparkSession.sparkContext
+
+    val (transformedSchema, pred) = model.preprocess(dataset)
+    val bBooster = sc.broadcast(model.nativeBooster)
+    val bOriginalSchema = sc.broadcast(dataset.schema)
+
+    // Positions of the (de-duplicated) feature columns inside the input schema
+    val featureIds = model.getFeaturesCols.distinct.map(dataset.schema.fieldIndex).toList
+    val isLocal = sc.isLocal
+    val missing = model.getMissing
+    val nThread = model.getNthread
+
+    val rdd = ColumnarRdd(dataset.asInstanceOf[DataFrame]).mapPartitions { tableIters =>
+      // booster is visible for all spark tasks in the same executor
+      val booster = bBooster.value
+      val originalSchema = bOriginalSchema.value
+
+      // UnsafeProjection is not serializable so do it on the executor side
+      val toUnsafe = UnsafeProjection.create(originalSchema)
+
+      // The column types are the same for every batch, so compute them once per partition
+      val dataTypes = originalSchema.fields.map(x => x.dataType)
+
+      // NOTE(review): this only configures the device when the "device" attribute is
+      // present but blank. If getAttr returns null for an unset attribute the booster is
+      // never switched to cuda here -- confirm this is the intended condition.
+      synchronized {
+        val device = booster.getAttr("device")
+        if (device != null && device.trim.isEmpty) {
+          booster.setAttr("device", "cuda")
+          val gpuId = if (!isLocal) XGBoost.getGPUAddrFromResources else 0
+          booster.setParam("device", s"cuda:$gpuId")
+          logger.info("GPU transform on GPU device: " + gpuId)
+        }
+      }
+
+      // Iterator on Row
+      new Iterator[Row] {
+        // Convert InternalRow to Row
+        private val converter: InternalRow => Row = CatalystTypeConverters
+          .createToScalaConverter(originalSchema)
+          .asInstanceOf[InternalRow => Row]
+
+        // GPU batches read in must be closed by the receiver
+        @transient var currentBatch: ColumnarBatch = null
+
+        // Predicted rows of the batch currently being served, null when there is none
+        var iter: Iterator[Row] = null
+
+        TaskContext.get().addTaskCompletionListener[Unit](_ => {
+          closeCurrentBatch() // close the last ColumnarBatch
+        })
+
+        private def closeCurrentBatch(): Unit = {
+          if (currentBatch != null) {
+            currentBatch.close()
+            currentBatch = null
+          }
+        }
+
+        /**
+         * Release the previous host batch and, if another table is available, predict on
+         * it and point `iter` at the resulting rows; otherwise set `iter` to null.
+         */
+        def loadNextBatch(): Unit = {
+          closeCurrentBatch()
+          if (tableIters.hasNext) {
+            iter = withResource(tableIters.next()) { table =>
+              val gpuColumnBatch = new GpuColumnBatch(table, originalSchema)
+              // Create DMatrix
+              val featureTable = gpuColumnBatch.select(featureIds.map(Integer.valueOf).asJava)
+              if (featureTable == null) {
+                throw new RuntimeException("Something wrong for feature indices")
+              }
+              try {
+                val cudfColumnBatch = new CudfColumnBatch(featureTable, null, null, null)
+                // `new` never yields null, so no null check is needed on the DMatrix
+                val dm = new DMatrix(cudfColumnBatch, missing, nThread)
+                try {
+                  // Host copy of the whole table; it is kept in currentBatch so it can be
+                  // closed when the next batch is loaded or when the task completes
+                  currentBatch = new ColumnarBatch(
+                    GpuColumnVectorUtils.extractHostColumns(table, dataTypes),
+                    table.getRowCount().toInt)
+                  val rowIterator = currentBatch.rowIterator().asScala.map(toUnsafe)
+                    .map(converter(_))
+                  model.predictInternal(booster, dm, pred, rowIterator).toIterator
+                } finally {
+                  dm.delete()
+                }
+              } finally {
+                featureTable.close()
+              }
+            }
+          } else {
+            iter = null
+          }
+        }
+
+        override def hasNext: Boolean = {
+          // Keep advancing while the current batch has no rows left: a batch that produces
+          // no rows must not end the iteration while further tables are still pending.
+          while ((iter == null || !iter.hasNext) && tableIters.hasNext) {
+            loadNextBatch()
+          }
+          val available = iter != null && iter.hasNext
+          if (!available) {
+            // Input exhausted: release the last host batch right away
+            closeCurrentBatch()
+            iter = null
+          }
+          available
+        }
+
+        override def next(): Row = {
+          if (!hasNext) {
+            throw new NoSuchElementException()
+          }
+          iter.next()
+        }
+      }
+    }
+    // NOTE(review): these calls are issued right after the RDD is defined, before the
+    // DataFrame below is built from it -- confirm that unpersisting here is intended.
+    bBooster.unpersist(false)
+    bOriginalSchema.unpersist(false)
+
+    val output = dataset.sparkSession.createDataFrame(rdd, transformedSchema)
+    model.postTransform(output, pred).toDF()
+  }
 }
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/binary.test.parquet b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/binary.test.parquet
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/binary.train.parquet b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/binary.train.parquet
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/multiclass.test.parquet b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/multiclass.test.parquet
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/multiclass.train.parquet b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/multiclass.train.parquet
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/regression.test.parquet b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/regression.test.parquet
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/regression.train.parquet b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/regression.train.parquet
diff --git a/...kages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuTestSuite.scala b/...kages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuTestSuite.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2021-2023 by Contributors
+ Copyright (c) 2021-2024 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.

diff --git a/...oost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala b/...oost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala
@@ -1,14 +1,33 @@
+/*
+ Copyright (c) 2024 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
 package ml.dmlc.xgboost4j.scala.spark
 
+import java.io.File
+
 import scala.collection.mutable.ArrayBuffer
 
+import ai.rapids.cudf.{CSVOptions, DType, Schema, Table}
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.types.{FloatType, StructField, StructType}
 
 import ml.dmlc.xgboost4j.scala.rapids.spark.GpuTestSuite
 
 class GpuXGBoostPluginSuite extends GpuTestSuite {
 
-
   test("isEnabled") {
     def checkIsEnabled(spark: SparkSession, expected: Boolean): Unit = {
       import spark.implicits._
@@ -37,7 +56,7 @@ class GpuXGBoostPluginSuite extends GpuTestSuite {
         (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f),
         (3.0f, 4.0f, 5.0f, 6.0f, 0.0f, 0.1f),
         (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.1f),
-        (5.0f, 6.0f, 7.0f, 8.0f, 0.0f, 0.1f),
+        (5.0f, 6.0f, 7.0f, 8.0f, 0.0f, 0.1f)
       ).toDF("c1", "c2", "weight", "margin", "label", "other")
       val classifier = new XGBoostClassifier()
 
@@ -64,7 +83,7 @@ class GpuXGBoostPluginSuite extends GpuTestSuite {
         (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f),
         (3.0f, 4.0f, 5.0f, 6.0f, 0.0f, 0.1f),
         (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.1f),
-        (5.0f, 6.0f, 7.0f, 8.0f, 0.0f, 0.1f),
+        (5.0f, 6.0f, 7.0f, 8.0f, 0.0f, 0.1f)
       ).toDF("c1", "c2", "weight", "margin", "label", "other")
         .repartition(5)
 
@@ -114,7 +133,7 @@ class GpuXGBoostPluginSuite extends GpuTestSuite {
             (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f),
             (3.0f, data, 5.0f, 6.0f, 0.0f, 0.1f),
             (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.1f),
-            (5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 0.1f),
+            (5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 0.1f)
           ).toDF("c1", "c2", "weight", "margin", "label", "other")
 
           val features = Array("c1", "c2")
@@ -168,7 +187,7 @@ class GpuXGBoostPluginSuite extends GpuTestSuite {
 
       val train = Seq(
         (1.0f, 2.0f, 1.0f, 2.0f, 0.0f, 0.0f),
-        (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f),
+        (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f)
       ).toDF("c1", "c2", "weight", "margin", "label", "other")
 
       // dataPoint -> (missing, rowNum, nonMissing)
@@ -179,7 +198,7 @@ class GpuXGBoostPluginSuite extends GpuTestSuite {
             (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f),
             (3.0f, data, 5.0f, 6.0f, 0.0f, 0.1f),
             (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.1f),
-            (5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 0.1f),
+            (5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 0.1f)
           ).toDF("c1", "c2", "weight", "margin", "label", "other")
 
           val features = Array("c1", "c2")
@@ -226,4 +245,19 @@ class GpuXGBoostPluginSuite extends GpuTestSuite {
       }
     }
   }
+
+
+  // Reads the binary training parquet both as a cudf Table and as a Spark DataFrame,
+  // then prints the DataFrame.
+  test("XGBoost-Spark should match xgboost4j") {
+    withGpuSparkSession() { spark =>
+      val trainPath = getResourcePath("/binary.train.parquet")
+
+      // NOTE(review): cols and label are not referenced yet in this test
+      val cols = Array("c0", "c1", "c2", "c3", "c4", "c5")
+      val label = "label"
+
+      val table = Table.readParquet(new File(trainPath))
+      try {
+        val df = spark.read.parquet(trainPath)
+        df.show()
+      } finally {
+        // Close the table explicitly so it is not leaked by the test
+        table.close()
+      }
+    }
+  }
 }
diff --git a/...ackages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XXXXXSuite.scala b/...ackages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XXXXXSuite.scala
@@ -16,7 +16,6 @@
 
 package ml.dmlc.xgboost4j.scala.spark
 
-import org.apache.spark.sql.functions.lit
 import org.scalatest.funsuite.AnyFunSuite
 
 import ml.dmlc.xgboost4j.scala.rapids.spark.GpuTestSuite
@@ -41,53 +40,55 @@ class XXXXXSuite extends AnyFunSuite with GpuTestSuite {
 
       var Array(trainDf, validationDf) = df.randomSplit(Array(0.8, 0.2), seed = 1)
 
-//      trainDf = trainDf.withColumn("validation", lit(false))
-//      validationDf = validationDf.withColumn("validationDf", lit(true))
+      //      trainDf = trainDf.withColumn("validation", lit(false))
+      //      validationDf = validationDf.withColumn("validationDf", lit(true))
 
-//      df = trainDf.union(validationDf)
-//
-//      // Assemble the feature columns into a single vector column
-//      val assembler = new VectorAssembler()
-//        .setInputCols(features)
-//        .setOutputCol("features")
-//      val dataset = assembler.transform(df)
+      //      df = trainDf.union(validationDf)
+      //
+      //      // Assemble the feature columns into a single vector column
+      //      val assembler = new VectorAssembler()
+      //        .setInputCols(features)
+      //        .setOutputCol("features")
+      //      val dataset = assembler.transform(df)
 
       //    val arrayInput = df.select(array(features.map(col(_)): _*).as("features"),
       //      col("label"), col("base_margin"))
 
       val est = new XGBoostClassifier()
         .setNumWorkers(1)
-        .setNumRound(2)
-        .setMaxDepth(3)
+        .setNumRound(100)
+        //        .setMaxDepth(3)
         //      .setWeightCol("weight")
         //      .setBaseMarginCol("base_margin")
         .setFeaturesCol(features)
         .setLabelCol(labelCol)
+        .setLeafPredictionCol("leaf")
+        .setContribPredictionCol("contrib")
         .setDevice("cuda")
-        .setEvalDataset(validationDf)
-//        .setValidationIndicatorCol("validation")
-        //      .setPredictionCol("")
-        .setRawPredictionCol("")
-        .setProbabilityCol("xxxx")
+      //        .setEvalDataset(validationDf)
+      //        .setValidationIndicatorCol("validation")
+      //      .setPredictionCol("")
+      //        .setRawPredictionCol("")
+      //        .setProbabilityCol("xxxx")
       //      .setContribPredictionCol("contrb")
       //      .setLeafPredictionCol("leaf")
       //    val est = new XGBoostClassifier().setLabelCol(labelCol)
       //    est.fit(arrayInput)
-      est.write.overwrite().save("/tmp/abcdef")
-      val loadedEst = XGBoostClassifier.load("/tmp/abcdef")
-      println(loadedEst.getNumRound)
-      println(loadedEst.getMaxDepth)
+      //      est.write.overwrite().save("/tmp/abcdef")
+      //      val loadedEst = XGBoostClassifier.load("/tmp/abcdef")
+      //      println(loadedEst.getNumRound)
+      //      println(loadedEst.getMaxDepth)
 
       val model = est.fit(trainDf)
-      println("-----------------------")
-      println(model.getNumRound)
-      println(model.getMaxDepth)
-
-//      model.write.overwrite().save("/tmp/model/")
-//      val loadedModel = XGBoostClassificationModel.load("/tmp/model")
-//      println(loadedModel.getNumRound)
-//      println(loadedModel.getMaxDepth)
-//      model.transform(df).drop(features: _*).show(150, false)
+
+      val out = model.transform(df)
+      out.printSchema()
+      out.show(150, false)
+      //      model.write.overwrite().save("/tmp/model/")
+      //      val loadedModel = XGBoostClassificationModel.load("/tmp/model")
+      //      println(loadedModel.getNumRound)
+      //      println(loadedModel.getMaxDepth)
+      //      model.transform(df).drop(features: _*).show(150, false)
     }
 
   }