From b1d0ef737a0911b95a11fd3b04cc8e438ea69501 Mon Sep 17 00:00:00 2001
From: Bobby Wang
Date: Wed, 17 Jul 2024 14:51:38 +0800
Subject: [PATCH] GPU: Support ranker and regressor (#10560)

---
 .../dmlc/xgboost4j/java/CudfColumnBatch.java  |  16 +-
 .../xgboost4j/scala/QuantileDMatrix.scala     |  11 +-
 .../scala/spark/GpuXGBoostPlugin.scala        |  18 +-
 .../ml/dmlc/xgboost4j/java/BoosterTest.java   |  53 +-
 .../ml/dmlc/xgboost4j/java/DMatrixTest.java   |  99 ++-
 .../src/test/resources/binary.test.parquet    | Bin 5417 -> 0 bytes
 .../src/test/resources/binary.train.parquet   | Bin 5511 -> 0 bytes
 .../test/resources/multiclass.test.parquet    | Bin 5374 -> 0 bytes
 .../test/resources/multiclass.train.parquet   | Bin 5668 -> 0 bytes
 .../test/resources/regression.test.parquet    | Bin 6010 -> 0 bytes
 .../test/resources/regression.train.parquet   | Bin 6008 -> 0 bytes
 .../scala/QuantileDMatrixSuite.scala          |  50 +-
 .../scala/spark/GpuXGBoostPluginSuite.scala   | 376 ++++++++-
 .../xgboost4j/scala/spark/TrainTestData.scala |  86 ++
 .../xgboost4j/scala/spark/XXXXXSuite.scala    |  95 ---
 .../ml/dmlc/xgboost4j/scala/spark/Utils.scala |  12 +-
 .../dmlc/xgboost4j/scala/spark/PerTest.scala  |  11 +-
 .../scala/spark/XGBoostClassifierSuite.scala  |   7 +
 .../scala/spark/XGBoostEstimatorSuite.scala   | 732 +++++++++---------
 .../java/ml/dmlc/xgboost4j/java/DMatrix.java  | 265 ++++---
 .../ml/dmlc/xgboost4j/scala/DMatrix.scala     |  63 +-
 .../xgboost4j/src/native/xgboost4j-gpu.cu     |  21 +-
 22 files changed, 1191 insertions(+), 724 deletions(-)
 delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/resources/binary.test.parquet
 delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/resources/binary.train.parquet
 delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/resources/multiclass.test.parquet
 delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/resources/multiclass.train.parquet
 delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/resources/regression.test.parquet
 delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/resources/regression.train.parquet
 create mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala
 delete mode 100644 jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XXXXXSuite.scala

diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/CudfColumnBatch.java b/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/CudfColumnBatch.java
index 90b394e5a1c5..2f1870c580be 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/CudfColumnBatch.java
+++ b/jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/CudfColumnBatch.java
@@ -39,18 +39,22 @@ public class CudfColumnBatch extends ColumnBatch {
   private final Table weightTable;
   @JsonIgnore
   private final Table baseMarginTable;
+  @JsonIgnore
+  private final Table qidTable;
 
   private List<CudfColumn> features;
   private List<CudfColumn> label;
   private List<CudfColumn> weight;
   private List<CudfColumn> baseMargin;
+  private List<CudfColumn> qid;
 
   public CudfColumnBatch(Table featureTable, Table labelTable, Table weightTable,
-                         Table baseMarginTable) {
+                         Table baseMarginTable, Table qidTable) {
     this.featureTable = featureTable;
     this.labelTable = labelTable;
     this.weightTable = weightTable;
     this.baseMarginTable = baseMarginTable;
+    this.qidTable = qidTable;
 
     features = initializeCudfColumns(featureTable);
     if (labelTable != null) {
@@ -66,6 +70,11 @@ public CudfColumnBatch(Table featureTable, Table labelTable, Table weightTable,
     if (baseMarginTable != null) {
       baseMargin =
        initializeCudfColumns(baseMarginTable);
     }
 
+    if (qidTable != null) {
+      qid = initializeCudfColumns(qidTable);
+    }
+
   }
 
   private List<CudfColumn> initializeCudfColumns(Table table) {
@@ -93,6 +102,10 @@ public List<CudfColumn> getBaseMargin() {
     return baseMargin;
   }
 
+  public List<CudfColumn> getQid() {
+    return qid;
+  }
+
   public String toJson() {
     ObjectMapper mapper = new ObjectMapper();
     mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL);
@@ -119,5 +132,6 @@ public void close() {
     if (labelTable != null) labelTable.close();
     if (weightTable != null) weightTable.close();
     if (baseMarginTable != null) baseMarginTable.close();
+    if (qidTable != null) qidTable.close();
   }
 }
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala
index 93a773829f43..73abf6df9d68 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrix.scala
@@ -24,9 +24,9 @@ class QuantileDMatrix private[scala](
   private[scala] override val jDMatrix: JQuantileDMatrix) extends DMatrix(jDMatrix) {
 
   /**
-   * Create QuantileDMatrix from iterator based on the cuda array interface
+   * Create QuantileDMatrix from iterator based on the array interface
    *
-   * @param iter the XGBoost ColumnBatch batch to provide the corresponding cuda array interface
+   * @param iter the XGBoost ColumnBatch batch to provide the corresponding array interface
    * @param missing the missing value
    * @param maxBin the max bin
    * @param nthread the parallelism
@@ -84,7 +84,7 @@ class QuantileDMatrix private[scala](
     throw new XGBoostError("QuantileDMatrix does not support setGroup.")
 
   /**
-   * Set label of DMatrix from cuda array interface
+   * Set label of DMatrix from array interface
    */
   @throws(classOf[XGBoostError])
   override def setLabel(column: Column): Unit =
@@ -104,4 +104,9 @@ class QuantileDMatrix private[scala](
   override def setBaseMargin(column: Column): Unit =
     throw new XGBoostError("QuantileDMatrix does not support setBaseMargin.")
 
+  @throws(classOf[XGBoostError])
+  override def setQueryId(column: Column): Unit = {
+    throw new XGBoostError("QuantileDMatrix does not support setQueryId.")
+  }
+
 }
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala
index 336f75eeffc8..4060697424c0 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPlugin.scala
@@ -33,6 +33,7 @@ import org.apache.spark.sql.vectorized.ColumnarBatch
 
 import ml.dmlc.xgboost4j.java.CudfColumnBatch
 import ml.dmlc.xgboost4j.scala.{DMatrix, QuantileDMatrix}
+import ml.dmlc.xgboost4j.scala.spark.Utils.withResource
 import ml.dmlc.xgboost4j.scala.spark.params.HasGroupCol
 
 /**
@@ -119,7 +120,7 @@ class GpuXGBoostPlugin extends XGBoostPlugin {
     val nthread = estimator.getNthread
     val missing = estimator.getMissing
 
-    /** build QuantilDMatrix on the executor side */
+    /** build QuantileDMatrix on the executor side */
    def buildQuantileDMatrix(iter: Iterator[Table]): QuantileDMatrix = {
      val colBatchIter = iter.map { table =>
        withResource(new GpuColumnBatch(table)) { batch =>
          new CudfColumnBatch(
batch.select(indices.featureIds.get), batch.select(indices.labelId), batch.select(indices.weightId.getOrElse(-1)), - batch.select(indices.marginId.getOrElse(-1))); + batch.select(indices.marginId.getOrElse(-1)), + batch.select(indices.groupId.getOrElse(-1))); } } new QuantileDMatrix(colBatchIter, missing, maxBin, nthread) @@ -150,16 +152,6 @@ class GpuXGBoostPlugin extends XGBoostPlugin { ) } - /** Executes the provided code block and then closes the resource */ - def withResource[T <: AutoCloseable, V](r: T)(block: T => V): V = { - try { - block(r) - } finally { - r.close() - } - } - - override def transform[M <: XGBoostModel[M]](model: XGBoostModel[M], dataset: Dataset[_]): DataFrame = { val sc = dataset.sparkSession.sparkContext @@ -226,7 +218,7 @@ class GpuXGBoostPlugin extends XGBoostPlugin { throw new RuntimeException("Something wrong for feature indices") } try { - val cudfColumnBatch = new CudfColumnBatch(featureTable, null, null, null) + val cudfColumnBatch = new CudfColumnBatch(featureTable, null, null, null, null) val dm = new DMatrix(cudfColumnBatch, missing, nThread) if (dm == null) { Iterator.empty diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/java/ml/dmlc/xgboost4j/java/BoosterTest.java b/jvm-packages/xgboost4j-spark-gpu/src/test/java/ml/dmlc/xgboost4j/java/BoosterTest.java index 50d25765edb2..c1283c8b9076 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/java/ml/dmlc/xgboost4j/java/BoosterTest.java +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/java/ml/dmlc/xgboost4j/java/BoosterTest.java @@ -22,8 +22,7 @@ import java.util.List; import java.util.Map; -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.Table; +import ai.rapids.cudf.*; import junit.framework.TestCase; import org.junit.Test; @@ -36,7 +35,27 @@ public class BoosterTest { @Test public void testBooster() throws XGBoostError { - String resourcePath = getClass().getResource("/binary.train.parquet").getFile(); + String trainingDataPath = "../../demo/data/veterans_lung_cancer.csv"; + Schema schema = Schema.builder() + .column(DType.FLOAT32, "A") + .column(DType.FLOAT32, "B") + .column(DType.FLOAT32, "C") + .column(DType.FLOAT32, "D") + + .column(DType.FLOAT32, "E") + .column(DType.FLOAT32, "F") + .column(DType.FLOAT32, "G") + .column(DType.FLOAT32, "H") + + .column(DType.FLOAT32, "I") + .column(DType.FLOAT32, "J") + .column(DType.FLOAT32, "K") + .column(DType.FLOAT32, "L") + + .column(DType.FLOAT32, "label") + .build(); + CSVOptions opts = CSVOptions.builder() + .hasHeader().build(); int maxBin = 16; int round = 10; @@ -53,32 +72,33 @@ public void testBooster() throws XGBoostError { } }; - try (Table table = Table.readParquet(new File(resourcePath))) { - ColumnVector[] features = new ColumnVector[6]; - for (int i = 0; i < 6; i++) { - features[i] = table.getColumn(i); + try (Table tmpTable = Table.readCSV(schema, opts, new File(trainingDataPath))) { + ColumnVector[] df = new ColumnVector[10]; + // exclude the first two columns, they are label bounds and contain inf. 
+      for (int i = 2; i < 12; ++i) {
+        df[i - 2] = tmpTable.getColumn(i);
       }
-
-      try (Table X = new Table(features)) {
+      try (Table X = new Table(df);) {
         ColumnVector[] labels = new ColumnVector[1];
-        labels[0] = table.getColumn(6);
+        labels[0] = tmpTable.getColumn(12);
 
-        try (Table y = new Table(labels)) {
+        try (Table y = new Table(labels);) {
 
-          CudfColumnBatch batch = new CudfColumnBatch(X, y, null, null);
-          CudfColumn labelColumn = CudfColumn.from(y.getColumn(0));
+          CudfColumnBatch batch = new CudfColumnBatch(X, y, null, null, null);
+          CudfColumn labelColumn = CudfColumn.from(tmpTable.getColumn(12));
 
-          // train XGBoost Booster base on DMatrix
+          // set watch list
           HashMap<String, DMatrix> watches = new HashMap<>();
+
           DMatrix dMatrix1 = new DMatrix(batch, Float.NaN, 1);
           dMatrix1.setLabel(labelColumn);
           watches.put("train", dMatrix1);
           Booster model1 = XGBoost.train(dMatrix1, paramMap, round, watches, null, null);
 
-          // train XGBoost Booster base on QuantileDMatrix
           List<ColumnBatch> tables = new LinkedList<>();
           tables.add(batch);
           DMatrix incrementalDMatrix = new QuantileDMatrix(tables.iterator(), Float.NaN, maxBin, 1);
+          // set watch list
           HashMap<String, DMatrix> watches1 = new HashMap<>();
           watches1.put("train", incrementalDMatrix);
           Booster model2 = XGBoost.train(incrementalDMatrix, paramMap, round, watches1, null, null);
@@ -86,11 +106,12 @@ public void testBooster() throws XGBoostError {
           float[][] predicat1 = model1.predict(dMatrix1);
           float[][] predicat2 = model2.predict(dMatrix1);
 
-          for (int i = 0; i < table.getRowCount(); i++) {
+          for (int i = 0; i < tmpTable.getRowCount(); i++) {
             TestCase.assertTrue(predicat1[i][0] - predicat2[i][0] < 1e-6);
           }
         }
       }
     }
   }
+
 }
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java b/jvm-packages/xgboost4j-spark-gpu/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
index 4293486a97b2..af94bf975cff 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
+++ b/jvm-packages/xgboost4j-spark-gpu/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
@@ -20,8 +20,6 @@
 import java.util.LinkedList;
 import java.util.List;
 
-import ai.rapids.cudf.ColumnVector;
-import ai.rapids.cudf.ColumnView;
 import ai.rapids.cudf.Table;
 import junit.framework.TestCase;
 import org.junit.Test;
@@ -36,24 +34,29 @@ public class DMatrixTest {
 
   @Test
   public void testCreateFromArrayInterfaceColumns() {
     Float[] labelFloats = new Float[]{2f, 4f, 6f, 8f, 10f};
+    Integer[] groups = new Integer[]{1, 1, 7, 7, 19, 26};
+    int[] expectedGroup = new int[]{0, 2, 4, 5, 6};
     Throwable ex = null;
     try (
       Table X = new Table.TestBuilder().column(1.f, null, 5.f, 7.f, 9.f).build();
       Table y = new Table.TestBuilder().column(labelFloats).build();
       Table w = new Table.TestBuilder().column(labelFloats).build();
+      Table q = new Table.TestBuilder().column(groups).build();
       Table margin = new Table.TestBuilder().column(labelFloats).build();) {
 
-      CudfColumnBatch cudfDataFrame = new CudfColumnBatch(X, y, w, null);
+      CudfColumnBatch cudfDataFrame = new CudfColumnBatch(X, y, w, null, null);
       CudfColumn labelColumn = CudfColumn.from(y.getColumn(0));
       CudfColumn weightColumn = CudfColumn.from(w.getColumn(0));
       CudfColumn baseMarginColumn = CudfColumn.from(margin.getColumn(0));
+      CudfColumn qidColumn = CudfColumn.from(q.getColumn(0));
 
       DMatrix dMatrix = new DMatrix(cudfDataFrame, 0, 1);
       dMatrix.setLabel(labelColumn);
       dMatrix.setWeight(weightColumn);
       dMatrix.setBaseMargin(baseMarginColumn);
+      dMatrix.setQueryId(qidColumn);
 
       String[] featureNames = new String[]{"f1"};
       dMatrix.setFeatureNames(featureNames);
@@ -69,10 +72,12 @@ public void testCreateFromArrayInterfaceColumns() {
       float[] label = dMatrix.getLabel();
       float[] weight = dMatrix.getWeight();
       float[] baseMargin = dMatrix.getBaseMargin();
+      int[] group = dMatrix.getGroup();
 
       TestCase.assertTrue(Arrays.equals(anchor, label));
       TestCase.assertTrue(Arrays.equals(anchor, weight));
       TestCase.assertTrue(Arrays.equals(anchor, baseMargin));
+      TestCase.assertTrue(Arrays.equals(expectedGroup, group));
     } catch (Throwable e) {
       ex = e;
       e.printStackTrace();
@@ -86,10 +91,14 @@ public void testCreateFromColumnDataIterator() throws XGBoostError {
     Float[] label1 = {25f, 21f, 22f, 20f, 24f};
     Float[] weight1 = {1.3f, 2.31f, 0.32f, 3.3f, 1.34f};
     Float[] baseMargin1 = {1.2f, 0.2f, 1.3f, 2.4f, 3.5f};
+    Integer[] groups1 = new Integer[]{1, 1, 7, 7, 19, 26};
 
     Float[] label2 = {9f, 5f, 4f, 10f, 12f};
     Float[] weight2 = {3.0f, 1.3f, 3.2f, 0.3f, 1.34f};
     Float[] baseMargin2 = {0.2f, 2.5f, 3.1f, 4.4f, 2.2f};
+    Integer[] groups2 = new Integer[]{30, 30, 30, 40, 40};
+
+    int[] expectedGroup = new int[]{0, 2, 4, 5, 6, 9, 11};
 
     try (
       Table X_0 = new Table.TestBuilder()
@@ -99,18 +108,21 @@ public void testCreateFromColumnDataIterator() throws XGBoostError {
       Table y_0 = new Table.TestBuilder().column(label1).build();
       Table w_0 = new Table.TestBuilder().column(weight1).build();
       Table m_0 = new Table.TestBuilder().column(baseMargin1).build();
+      Table q_0 = new Table.TestBuilder().column(groups1).build();
+
       Table X_1 = new Table.TestBuilder().column(11.2f, 11.2f, 15.2f, 17.2f, 19.2f)
           .column(1.2f, 1.4f, null, 12.6f, 10.10f).build();
       Table y_1 = new Table.TestBuilder().column(label2).build();
       Table w_1 = new Table.TestBuilder().column(weight2).build();
       Table m_1 = new Table.TestBuilder().column(baseMargin2).build();) {
+      Table q_1 = new Table.TestBuilder().column(groups2).build();
 
       List<ColumnBatch> tables = new LinkedList<>();
-      tables.add(new CudfColumnBatch(X_0, y_0, w_0, m_0));
-      tables.add(new CudfColumnBatch(X_1, y_1, w_1, m_1));
+      tables.add(new CudfColumnBatch(X_0, y_0, w_0, m_0, q_0));
+      tables.add(new CudfColumnBatch(X_1, y_1, w_1, m_1, q_1));
 
-      DMatrix dmat = new QuantileDMatrix(tables.iterator(), 0.0f, 8, 1);
+      DMatrix dmat = new QuantileDMatrix(tables.iterator(), 0.0f, 256, 1);
 
       float[] anchorLabel = convertFloatTofloat(label1, label2);
       float[] anchorWeight = convertFloatTofloat(weight1, weight2);
@@ -119,6 +131,7 @@ public void testCreateFromColumnDataIterator() throws XGBoostError {
       TestCase.assertTrue(Arrays.equals(anchorLabel, dmat.getLabel()));
       TestCase.assertTrue(Arrays.equals(anchorWeight, dmat.getWeight()));
       TestCase.assertTrue(Arrays.equals(anchorBaseMargin, dmat.getBaseMargin()));
+      TestCase.assertTrue(Arrays.equals(expectedGroup, dmat.getGroup()));
     }
   }
 
@@ -140,43 +153,43 @@ private float[] convertFloatTofloat(Float[]... datas) {
 
   @Test
   public void testMakingDMatrixViaArray() {
-//    Float[][] features1 = {
-//      {1.0f, 12.0f},
-//      {2.0f, 13.0f},
-//      null,
-//      {4.0f, null},
-//      {5.0f, 16.0f}
-//    };
-//
-//    Float[] label1 = {0.0f, 1.0f, 0.0f, 1.0f, 0.0f};
-//
-//    Table X1 = new Table.TestBuilder().column(features1).build();
-//    Table y1 = new Table.TestBuilder().column(label1).build();
-//
-//    ColumnVector t = X1.getColumn(0);
-//    ColumnView cv = t.getChildColumnView(0);
-//    //
-//    System.out.println("----");
-//
-//    Float[][] features2 = {
-//      {6.0f, 17.0f},
-//      {7.0f, 18.0f},
-//    };
-//    Float[] label2 = {0.0f, 1.0f, 0.0f, 1.0f, 0.0f};
-//    Table X2 = new Table.TestBuilder().column(features2).build();
-//    Table y2 = new Table.TestBuilder().column(label2).build();
-//
-//    List<ColumnBatch> tables = new LinkedList<>();
-//    tables.add(new CudfColumnBatch(X1, y1, null, null));
-//    tables.add(new CudfColumnBatch(X2, y2, null, null));
-//
-//    try {
-//      DMatrix dmat = new QuantileDMatrix(tables.iterator(), 0.0f, 8, 1);
-//    } catch (XGBoostError e) {
-//      throw new RuntimeException(e);
-//    }
-//
-//    System.out.println("--------------");
+    // Float[][] features1 = {
+    //   {1.0f, 12.0f},
+    //   {2.0f, 13.0f},
+    //   null,
+    //   {4.0f, null},
+    //   {5.0f, 16.0f}
+    // };
+    //
+    // Float[] label1 = {0.0f, 1.0f, 0.0f, 1.0f, 0.0f};
+    //
+    // Table X1 = new Table.TestBuilder().column(features1).build();
+    // Table y1 = new Table.TestBuilder().column(label1).build();
+    //
+    // ColumnVector t = X1.getColumn(0);
+    // ColumnView cv = t.getChildColumnView(0);
+    // //
+    // System.out.println("----");
+    //
+    // Float[][] features2 = {
+    //   {6.0f, 17.0f},
+    //   {7.0f, 18.0f},
+    // };
+    // Float[] label2 = {0.0f, 1.0f, 0.0f, 1.0f, 0.0f};
+    // Table X2 = new Table.TestBuilder().column(features2).build();
+    // Table y2 = new Table.TestBuilder().column(label2).build();
+    //
+    // List<ColumnBatch> tables = new LinkedList<>();
+    // tables.add(new CudfColumnBatch(X1, y1, null, null));
+    // tables.add(new CudfColumnBatch(X2, y2, null, null));
+    //
+    // try {
+    //   DMatrix dmat = new QuantileDMatrix(tables.iterator(), 0.0f, 8, 1);
+    // } catch (XGBoostError e) {
+    //   throw new RuntimeException(e);
+    // }
+    //
+    // System.out.println("--------------");
   }
 
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/binary.test.parquet b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/binary.test.parquet
deleted file mode 100644
index 5897b6fadb2b7a9d7081e7c96752b414ddd301c4..0000000000000000000000000000000000000000
GIT binary patch
[binary parquet payload omitted]
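[Editor's note, not part of the patch: the expectedGroup arrays asserted in DMatrixTest above come from XGBoost's group-pointer encoding. A sorted qid column such as {1, 1, 7, 7, 19, 26} collapses into the cumulative boundaries {0, 2, 4, 5, 6}, where entry i is the row index at which the i-th query group starts and the final entry is the total row count. A minimal standalone Scala sketch of that conversion follows; the helper name qidsToGroupPtr is ours, not part of the library.]

    // Collapse a sorted qid column into cumulative group boundary pointers.
    // qidsToGroupPtr(Seq(1, 1, 7, 7, 19, 26)) == Seq(0, 2, 4, 5, 6); appending a
    // second batch (30, 30, 30, 40, 40) yields 0, 2, 4, 5, 6, 9, 11, matching
    // the expectedGroup values asserted in the tests above.
    def qidsToGroupPtr(qids: Seq[Int]): Seq[Int] = {
      require(qids == qids.sorted, "qid column must be sorted by query id")
      // A group starts wherever the qid value changes (or at row 0).
      val starts = qids.indices.filter(i => i == 0 || qids(i) != qids(i - 1))
      starts :+ qids.length
    }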
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/binary.train.parquet b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/binary.train.parquet
deleted file mode 100644
index 780efdc13d363a1509b8faab61a8a1b9bb1c0615..0000000000000000000000000000000000000000
GIT binary patch
[binary parquet payload omitted]

diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/multiclass.test.parquet b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/multiclass.test.parquet
deleted file mode 100644
index b8347280f9935d131d9557dac06cdf2558b3b304..0000000000000000000000000000000000000000
GIT binary patch
[binary parquet payload omitted]

diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/multiclass.train.parquet b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/multiclass.train.parquet
deleted file mode 100644
index 066f31b0ffa3a3ea1b7a8be7d1494a62b246e032..0000000000000000000000000000000000000000
GIT binary patch
[binary parquet payload omitted]

diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/regression.test.parquet b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/regression.test.parquet
deleted file mode 100644
GIT binary patch
[binary parquet payload omitted]

diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/resources/regression.train.parquet b/jvm-packages/xgboost4j-spark-gpu/src/test/resources/regression.train.parquet
deleted file mode 100644
GIT binary patch
[binary parquet payload omitted]
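[Editor's note, not part of the patch: the deleted parquet fixtures above are replaced by synthetic datasets generated in TrainTestData.scala later in this patch and written to temporary parquet files at test time. The suites that follow all feed data through the widened five-argument CudfColumnBatch constructor; its call shape is sketched below in Scala as orientation. The table values are hypothetical placeholders, and every table other than the features may be null when that field is absent.]

    import ai.rapids.cudf.Table
    import ml.dmlc.xgboost4j.java.CudfColumnBatch
    import ml.dmlc.xgboost4j.scala.QuantileDMatrix

    // Hypothetical cuDF tables; the tests build theirs with Table.TestBuilder
    // or Table.readParquet.
    val featureTable: Table = ???   // required: one column per feature
    val labelTable: Table = ???     // optional, may be null
    val batch = new CudfColumnBatch(featureTable, labelTable,
      /* weight */ null, /* baseMargin */ null, /* qid */ null) // qid slot is new
    // QuantileDMatrix(iterator, missing, maxBin, nthread), as used in the suites.
    val qdm = new QuantileDMatrix(Seq(batch).iterator, Float.NaN, 256, 1)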
zj>0jLxka6P(JLiEW5r9p)z&5a`8O8`x?3g*s^lMv^Ydm4mIX}^7_DB;@5r0M53;He zsJ}fVF%4;yRF>`KnXkVlp0~h2(p&PAz$0L-!0V_+UE_W~L5A9SetfGD-#IW<^7*5H z*L1){99+{X@chF_B4=|*Fv3YgaxQ6=q|{JPu-r6R;!!Zj{Yssg#L}TsP{r*Q>{-l` z^zJmTOS;_2JJXm}_vGq4iO{B2l9hi+++v?AxMO!p;uO73lHhhi(6D%b`^OeX_t?Vg zI3%cqKeow1AotluoY*WBbo)6-Hm&?1KDk|+*W)`%;-GKpuG>-|u<&^=@+!C@X}W!z zFZNZbJ9X4f;Pl+Iu2(&kzlbds4gORoN%BtP5AK;HIZ}L0qP>YD2=a83nCfm8SH6%) z^3Q5Wel$xJPs`C2D`>uvtStH@czaadJ;O>~9LhZ=Ft}k zE+n>E()~zL6udH$Z{{>v&~(X2a`ruoF9{i5=kvKjoLOil{_Eo`!OwR2f;CKtyJzAg z$+OD}b?MqW1osS!#CBG8g53c+f~AIy;-f!%)LC@Qs5>#z+TG7jP24u>sraE?saR=1 zr$lEB3hpl3E3xn$A=q`QLE@=qAPCduNiLjFaBmW>kZ=-G#QDGLih*d=or|m%M~2RJ z&&ruDsed<*&*bqXFDLWre1ew?BvxX+*{th)xp}t)Q3b06O~G>oId?Q9csxMT7+_d8 z^2X1C2*+y@#K+6lD+B(l6P&0=B$9hsF9T%bvQVvh3lszbw0_ zN@dw~yuU2Fa7ks^U8%n;JGG>;>=5iP%eE?6Z|Og&zbx~PRF+NW-?9-$mHkdlIwqFB z5hqEV-S;i-(`EO5i{BW3*I!tEk9)Y^rk4D^*;MhfT<3bmNM+LLBSZvjy z@_O>2_TK{Tw_wz_VDz`Z;alMNHBgES4hfHBaSqDs8|kkcCVg?Bs_&?*%F>td*z!`H z)1$_Dr#4L0?du$Wop26o_?H>~nn{11^c-TbyM*8Y&}l4mkigZ+aEXTEAE*Zr&mqdB*n?rR8LX1z%tzT ztUs<*hLe23y)o{O8{L0&Z;YilnHTgFWyX-TIGKm^ z6lI2z;l5}6aht|Tal5=_KemV4ALrN~*Ta>KPU<~9MVS%5jP9STKW>){*YE|`J@z~9 z>yq~;c8`@3WwzB*lzCW2{GRnEo{$lHWkgP%r_>w6R5>?22TI*NLQijT?0nncCBafp z+J3R8?b7hrxP^d311;tSQu9D*xRrB64xCtXFhGq}0)o z(q?I0_z3fFNORZ3RjWpPFXjIC(oz4tbo75Ob@=b4j{mvz%jWR^te&%qr8oZXRUH`> zYx=hhOu4q)(Y9RE;UNhNBg0I?LL9g*qq+8D!W=`KM>~XvI*u7_ALin09~u@M&UNC3 dJJ`E~g@puL>8~_mvHthNQd45FCQ0A9{~rcN%{c%7 diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala index 1c8b36af299d..ceebcfd41f7a 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala @@ -22,6 +22,7 @@ import ai.rapids.cudf.Table import org.scalatest.funsuite.AnyFunSuite import ml.dmlc.xgboost4j.java.CudfColumnBatch +import ml.dmlc.xgboost4j.scala.spark.Utils.withResource class QuantileDMatrixSuite extends AnyFunSuite { @@ -30,10 +31,14 @@ class QuantileDMatrixSuite extends AnyFunSuite { val label1 = Array[java.lang.Float](25f, 21f, 22f, 20f, 24f) val weight1 = Array[java.lang.Float](1.3f, 2.31f, 0.32f, 3.3f, 1.34f) val baseMargin1 = Array[java.lang.Float](1.2f, 0.2f, 1.3f, 2.4f, 3.5f) + val group1 = Array[java.lang.Integer](1, 1, 7, 7, 19, 26) val label2 = Array[java.lang.Float](9f, 5f, 4f, 10f, 12f) val weight2 = Array[java.lang.Float](3.0f, 1.3f, 3.2f, 0.3f, 1.34f) val baseMargin2 = Array[java.lang.Float](0.2f, 2.5f, 3.1f, 4.4f, 2.2f) + val group2 = Array[java.lang.Integer](30, 30, 30, 40, 40) + + val expectedGroup = Array(0, 2, 4, 5, 6, 9, 11) withResource(new Table.TestBuilder() .column(1.2f, null.asInstanceOf[java.lang.Float], 5.2f, 7.2f, 9.2f) @@ -42,21 +47,27 @@ class QuantileDMatrixSuite extends AnyFunSuite { withResource(new Table.TestBuilder().column(label1: _*).build) { y_0 => withResource(new Table.TestBuilder().column(weight1: _*).build) { w_0 => withResource(new Table.TestBuilder().column(baseMargin1: _*).build) { m_0 => - withResource(new Table.TestBuilder() - .column(11.2f, 11.2f, 15.2f, 17.2f, 19.2f.asInstanceOf[java.lang.Float]) - .column(1.2f, 1.4f, null.asInstanceOf[java.lang.Float], 12.6f, 10.10f).build) { X_1 => - withResource(new Table.TestBuilder().column(label2: _*).build) { y_1 => - withResource(new Table.TestBuilder().column(weight2: _*).build) { w_1 => - withResource(new 
Table.TestBuilder().column(baseMargin2: _*).build) { m_1 => - val batches = new ArrayBuffer[CudfColumnBatch]() - batches += new CudfColumnBatch(X_0, y_0, w_0, m_0) - batches += new CudfColumnBatch(X_1, y_1, w_1, m_1) - val dmatrix = new QuantileDMatrix(batches.toIterator, 0.0f, 8, 1) - assert(dmatrix.getLabel.sameElements(label1 ++ label2)) - assert(dmatrix.getWeight.sameElements(weight1 ++ weight2)) - assert(dmatrix.getBaseMargin.sameElements(baseMargin1 ++ baseMargin2)) + withResource(new Table.TestBuilder().column(group1: _*).build) { q_0 => + withResource(new Table.TestBuilder() + .column(11.2f, 11.2f, 15.2f, 17.2f, 19.2f.asInstanceOf[java.lang.Float]) + .column(1.2f, 1.4f, null.asInstanceOf[java.lang.Float], 12.6f, 10.10f).build) { + X_1 => + withResource(new Table.TestBuilder().column(label2: _*).build) { y_1 => + withResource(new Table.TestBuilder().column(weight2: _*).build) { w_1 => + withResource(new Table.TestBuilder().column(baseMargin2: _*).build) { m_1 => + withResource(new Table.TestBuilder().column(group2: _*).build) { q_2 => + val batches = new ArrayBuffer[CudfColumnBatch]() + batches += new CudfColumnBatch(X_0, y_0, w_0, m_0, q_0) + batches += new CudfColumnBatch(X_1, y_1, w_1, m_1, q_2) + val dmatrix = new QuantileDMatrix(batches.toIterator, 0.0f, 8, 1) + assert(dmatrix.getLabel.sameElements(label1 ++ label2)) + assert(dmatrix.getWeight.sameElements(weight1 ++ weight2)) + assert(dmatrix.getBaseMargin.sameElements(baseMargin1 ++ baseMargin2)) + assert(dmatrix.getGroup().sameElements(expectedGroup)) + } + } + } } - } } } } @@ -64,15 +75,4 @@ class QuantileDMatrixSuite extends AnyFunSuite { } } } - - /** Executes the provided code block and then closes the resource */ - private def withResource[T <: AutoCloseable, V](r: T)(block: T => V): V = { - try { - block(r) - } finally { - r.close() - } - } - } - diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala index 9d2b41faadea..4b7e7e34b8ef 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala @@ -20,14 +20,66 @@ import java.io.File import scala.collection.mutable.ArrayBuffer -import ai.rapids.cudf.{CSVOptions, DType, Schema, Table} -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.types.{FloatType, StructField, StructType} +import ai.rapids.cudf.Table +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.sql.{Dataset, SparkSession} +import ml.dmlc.xgboost4j.java.CudfColumnBatch +import ml.dmlc.xgboost4j.scala.{DMatrix, QuantileDMatrix, XGBoost => ScalaXGBoost} import ml.dmlc.xgboost4j.scala.rapids.spark.GpuTestSuite +import ml.dmlc.xgboost4j.scala.spark.Utils.withResource class GpuXGBoostPluginSuite extends GpuTestSuite { + test("params") { + withGpuSparkSession() { spark => + import spark.implicits._ + val df = Seq((1.0f, 2.0f, 1.0f, 2.0f, 0.0f, 0.0f), + (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f), + (3.0f, 4.0f, 5.0f, 6.0f, 0.0f, 0.1f), + (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.1f), + (5.0f, 6.0f, 7.0f, 8.0f, 0.0f, 0.1f) + ).toDF("c1", "c2", "weight", "margin", "label", "other") + val xgbParams: Map[String, Any] = Map( + "max_depth" -> 5, + "eta" -> 0.2, + "objective" -> "binary:logistic" + ) + val features = Array("c1", "c2") + val estimator = new 
XGBoostClassifier(xgbParams) + .setFeaturesCol(features) + .setMissing(0.2f) + .setAlpha(0.97) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + .setNumRound(1) + .setDevice("cuda") + + assert(estimator.getMaxDepth === 5) + assert(estimator.getEta === 0.2) + assert(estimator.getObjective === "binary:logistic") + assert(estimator.getFeaturesCols === features) + assert(estimator.getMissing === 0.2f) + assert(estimator.getAlpha === 0.97) + assert(estimator.getDevice === "cuda") + + estimator.setEta(0.66).setMaxDepth(7) + assert(estimator.getMaxDepth === 7) + assert(estimator.getEta === 0.66) + + val model = estimator.train(df) + assert(model.getMaxDepth === 7) + assert(model.getEta === 0.66) + assert(model.getObjective === "binary:logistic") + assert(model.getFeaturesCols === features) + assert(model.getMissing === 0.2f) + assert(model.getAlpha === 0.97) + assert(model.getLeafPredictionCol === "leaf") + assert(model.getContribPredictionCol === "contrib") + assert(model.getDevice === "cuda") + } + } + test("isEnabled") { def checkIsEnabled(spark: SparkSession, expected: Boolean): Unit = { import spark.implicits._ @@ -120,6 +172,7 @@ class GpuXGBoostPluginSuite extends GpuTestSuite { } } + // test distributed test("build RDD Watches") { withGpuSparkSession() { spark => import spark.implicits._ @@ -179,10 +232,44 @@ class GpuXGBoostPluginSuite extends GpuTestSuite { } } - test("build RDD Watches with Eval") { + // must set num worker to 1 + test("build RDD Watches with group") { withGpuSparkSession() { spark => import spark.implicits._ + val df = Seq( + (1.0f, 2.0f, 1.0f, 2.0f, 0.0f, 11, 0.0f), + (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 11, 0.1f), + (3.0f, 4.0f, 5.0f, 6.0f, 0.0f, 30, 0.1f), + (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 30, 0.1f), + (5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 41, 0.1f) + ).toDF("c1", "c2", "weight", "margin", "label", "group", "other") + + val features = Array("c1", "c2") + val classifier = new XGBoostRanker() + .setNumWorkers(1) + .setGroupCol("group") + .setFeaturesCol(features) + .setDevice("cuda") + + val rdd = classifier.getPlugin.get.buildRddWatches(classifier, df) + val result = rdd.mapPartitions { iter => + val watches = iter.next() + Iterator.single(watches.datasets(0).getGroup) + }.collect() + + val groups: ArrayBuffer[Int] = ArrayBuffer.empty + + for (row <- result) { + groups.append(row: _*) + } + assert(groups.sorted === Array(0, 2, 4, 5).sorted) + } + } + + test("build RDD Watches with Eval") { + withGpuSparkSession() { spark => + import spark.implicits._ val train = Seq( (1.0f, 2.0f, 1.0f, 2.0f, 0.0f, 0.0f), (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f) @@ -244,18 +331,289 @@ class GpuXGBoostPluginSuite extends GpuTestSuite { } } + test("transformed schema") { + withGpuSparkSession() { spark => + import spark.implicits._ + val df = Seq( + (1.0f, 2.0f, 1.0f, 2.0f, 0.0f, 0.0f), + (2.0f, 3.0f, 2.0f, 3.0f, 1.0f, 0.1f), + (3.0f, 4.0f, 5.0f, 6.0f, 0.0f, 0.1f), + (4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.1f), + (5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 0.1f) + ).toDF("c1", "c2", "weight", "margin", "label", "other") + + val estimator = new XGBoostClassifier() + .setNumWorkers(1) + .setNumRound(2) + .setFeaturesCol(Array("c1", "c2")) + .setLabelCol("label") + .setDevice("cuda") + + assert(estimator.getPlugin.isDefined && estimator.getPlugin.get.isEnabled(df)) + + val out = estimator.train(df).transform(df) + // Transform should not discard the other columns of the transforming dataframe + Seq("c1", "c2", "weight", "margin", "label", "other").foreach { v => + assert(out.schema.names.contains(v)) + } 
+ + // Transform for XGBoostClassifier needs to add extra columns + Seq("rawPrediction", "probability", "prediction").foreach { v => + assert(out.schema.names.contains(v)) + } + assert(out.schema.names.length === 9) + + val out1 = estimator.setLeafPredictionCol("leaf").setContribPredictionCol("contrib") + .train(df) + .transform(df) + Seq("leaf", "contrib").foreach { v => + assert(out1.schema.names.contains(v)) + } + } + } - test("XGBoost-Spark should match xgboost4j") { + private def checkEqual(left: Array[Array[Float]], + right: Array[Array[Float]], + epsilon: Float = 1e-4f): Unit = { + assert(left.size === right.size) + left.zip(right).foreach { case (leftValue, rightValue) => + leftValue.zip(rightValue).foreach { case (l, r) => + assert(math.abs(l - r) < epsilon) + } + } + } + + Seq("binary:logistic", "multi:softprob").foreach { case objective => + test(s"$objective: XGBoost-Spark should match xgboost4j") { + withGpuSparkSession() { spark => + import spark.implicits._ + + val numRound = 100 + var xgboostParams: Map[String, Any] = Map( + "objective" -> objective, + "device" -> "cuda" + ) + + val (trainPath, testPath) = if (objective == "binary:logistic") { + (writeFile(Classification.train.toDF("label", "weight", "c1", "c2", "c3")), + writeFile(Classification.test.toDF("label", "weight", "c1", "c2", "c3"))) + } else { + xgboostParams = xgboostParams ++ Map("num_class" -> 6) + (writeFile(MultiClassification.train.toDF("label", "weight", "c1", "c2", "c3")), + writeFile(MultiClassification.test.toDF("label", "weight", "c1", "c2", "c3"))) + } + + val df = spark.read.parquet(trainPath) + val testdf = spark.read.parquet(testPath) + + val features = Array("c1", "c2", "c3") + val featuresIndices = features.map(df.schema.fieldIndex) + val label = "label" + + val classifier = new XGBoostClassifier(xgboostParams) + .setFeaturesCol(features) + .setLabelCol(label) + .setNumRound(numRound) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + .setDevice("cuda") + + val xgb4jModel = withResource(new GpuColumnBatch( + Table.readParquet(new File(trainPath)))) { batch => + val cb = new CudfColumnBatch(batch.select(featuresIndices), + batch.select(df.schema.fieldIndex(label)), null, null, null + ) + val qdm = new QuantileDMatrix(Seq(cb).iterator, classifier.getMissing, + classifier.getMaxBins, classifier.getNthread) + ScalaXGBoost.train(qdm, xgboostParams, numRound) + } + + val (xgb4jLeaf, xgb4jContrib, xgb4jProb, xgb4jRaw) = withResource(new GpuColumnBatch( + Table.readParquet(new File(testPath)))) { batch => + val cb = new CudfColumnBatch(batch.select(featuresIndices), null, null, null, null + ) + val qdm = new DMatrix(cb, classifier.getMissing, classifier.getNthread) + (xgb4jModel.predictLeaf(qdm), xgb4jModel.predictContrib(qdm), + xgb4jModel.predict(qdm), xgb4jModel.predict(qdm, outPutMargin = true)) + } + + val rows = classifier.train(df).transform(testdf).collect() + + // Check Leaf + val xgbSparkLeaf = rows.map(row => row.getAs[DenseVector]("leaf").toArray.map(_.toFloat)) + checkEqual(xgb4jLeaf, xgbSparkLeaf) + + // Check contrib + val xgbSparkContrib = rows.map(row => + row.getAs[DenseVector]("contrib").toArray.map(_.toFloat)) + checkEqual(xgb4jContrib, xgbSparkContrib) + + // Check probability + var xgbSparkProb = rows.map(row => + row.getAs[DenseVector]("probability").toArray.map(_.toFloat)) + if (objective == "binary:logistic") { + xgbSparkProb = xgbSparkProb.map(v => Array(v(1))) + } + checkEqual(xgb4jProb, xgbSparkProb) + + // Check raw + var xgbSparkRaw = rows.map(row => + 
row.getAs[DenseVector]("rawPrediction").toArray.map(_.toFloat)) + if (objective == "binary:logistic") { + xgbSparkRaw = xgbSparkRaw.map(v => Array(v(1))) + } + checkEqual(xgb4jRaw, xgbSparkRaw) + + } + } + } + + test(s"Regression: XGBoost-Spark should match xgboost4j") { + withGpuSparkSession() { spark => + import spark.implicits._ + + val trainPath = writeFile(Regression.train.toDF("label", "weight", "c1", "c2", "c3")) + val testPath = writeFile(Regression.test.toDF("label", "weight", "c1", "c2", "c3")) + + val df = spark.read.parquet(trainPath) + val testdf = spark.read.parquet(testPath) + + val features = Array("c1", "c2", "c3") + val featuresIndices = features.map(df.schema.fieldIndex) + val label = "label" + + val numRound = 100 + val xgboostParams: Map[String, Any] = Map( + "device" -> "cuda" + ) + + val regressor = new XGBoostRegressor(xgboostParams) + .setFeaturesCol(features) + .setLabelCol(label) + .setNumRound(numRound) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + .setDevice("cuda") + + val xgb4jModel = withResource(new GpuColumnBatch( + Table.readParquet(new File(trainPath)))) { batch => + val cb = new CudfColumnBatch(batch.select(featuresIndices), + batch.select(df.schema.fieldIndex(label)), null, null, null + ) + val qdm = new QuantileDMatrix(Seq(cb).iterator, regressor.getMissing, + regressor.getMaxBins, regressor.getNthread) + ScalaXGBoost.train(qdm, xgboostParams, numRound) + } + + val (xgb4jLeaf, xgb4jContrib, xgb4jPred) = withResource(new GpuColumnBatch( + Table.readParquet(new File(testPath)))) { batch => + val cb = new CudfColumnBatch(batch.select(featuresIndices), null, null, null, null + ) + val qdm = new DMatrix(cb, regressor.getMissing, regressor.getNthread) + (xgb4jModel.predictLeaf(qdm), xgb4jModel.predictContrib(qdm), + xgb4jModel.predict(qdm)) + } + + val rows = regressor.train(df).transform(testdf).collect() + + // Check Leaf + val xgbSparkLeaf = rows.map(row => row.getAs[DenseVector]("leaf").toArray.map(_.toFloat)) + checkEqual(xgb4jLeaf, xgbSparkLeaf) + + // Check contrib + val xgbSparkContrib = rows.map(row => + row.getAs[DenseVector]("contrib").toArray.map(_.toFloat)) + checkEqual(xgb4jContrib, xgbSparkContrib) + + // Check prediction + val xgbSparkPred = rows.map(row => + Array(row.getAs[Double]("prediction").toFloat)) + checkEqual(xgb4jPred, xgbSparkPred) + } + } + + test("Ranker: XGBoost-Spark should match xgboost4j") { withGpuSparkSession() { spark => + import spark.implicits._ - val cols = Array("c0", "c1", "c2", "c3", "c4", "c5") + val trainPath = writeFile(Ranking.train.toDF("label", "weight", "group", "c1", "c2", "c3")) + val testPath = writeFile(Ranking.test.toDF("label", "weight", "group", "c1", "c2", "c3")) + + val df = spark.read.parquet(trainPath) + val testdf = spark.read.parquet(testPath) + + val features = Array("c1", "c2", "c3") + val featuresIndices = features.map(df.schema.fieldIndex) val label = "label" + val group = "group" + + val numRound = 100 + val xgboostParams: Map[String, Any] = Map( + "device" -> "cuda", + "objective" -> "rank:ndcg" + ) + + val ranker = new XGBoostRanker(xgboostParams) + .setFeaturesCol(features) + .setLabelCol(label) + .setNumRound(numRound) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + .setGroupCol(group) + .setDevice("cuda") - val table = Table.readParquet(new File(getResourcePath("/binary.train.parquet"))) - val df = spark.read.parquet(getResourcePath("/binary.train.parquet")) + val xgb4jModel = withResource(new GpuColumnBatch( + Table.readParquet(new 
File(trainPath)))) { batch => + val cb = new CudfColumnBatch(batch.select(featuresIndices), + batch.select(df.schema.fieldIndex(label)), null, null, + batch.select(df.schema.fieldIndex(group))) + val qdm = new QuantileDMatrix(Seq(cb).iterator, ranker.getMissing, + ranker.getMaxBins, ranker.getNthread) + ScalaXGBoost.train(qdm, xgboostParams, numRound) + } + + val (xgb4jLeaf, xgb4jContrib, xgb4jPred) = withResource(new GpuColumnBatch( + Table.readParquet(new File(testPath)))) { batch => + val cb = new CudfColumnBatch(batch.select(featuresIndices), null, null, null, null + ) + val qdm = new DMatrix(cb, ranker.getMissing, ranker.getNthread) + (xgb4jModel.predictLeaf(qdm), xgb4jModel.predictContrib(qdm), + xgb4jModel.predict(qdm)) + } + + val rows = ranker.train(df).transform(testdf).collect() + + // Check Leaf + val xgbSparkLeaf = rows.map(row => row.getAs[DenseVector]("leaf").toArray.map(_.toFloat)) + checkEqual(xgb4jLeaf, xgbSparkLeaf) + + // Check contrib + val xgbSparkContrib = rows.map(row => + row.getAs[DenseVector]("contrib").toArray.map(_.toFloat)) + checkEqual(xgb4jContrib, xgbSparkContrib) + // Check prediction + val xgbSparkPred = rows.map(row => + Array(row.getAs[Double]("prediction").toFloat)) + checkEqual(xgb4jPred, xgbSparkPred) + } + } - df.show() + def writeFile(df: Dataset[_]): String = { + def listFiles(directory: String): Array[String] = { + val dir = new File(directory) + if (dir.exists && dir.isDirectory) { + dir.listFiles.filter(f => f.isFile && f.getName.startsWith("part-")).map(_.getName) + } else { + Array.empty[String] + } } + + val dir = createTmpFolder("gpu_").toAbsolutePath.toString + df.coalesce(1).write.parquet(s"$dir/data") + + val file = listFiles(s"$dir/data")(0) + s"$dir/data/$file" } + } diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala new file mode 100644 index 000000000000..49c790fd0a00 --- /dev/null +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TrainTestData.scala @@ -0,0 +1,86 @@ +/* + Copyright (c) 2014-2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */
+
+package ml.dmlc.xgboost4j.scala.spark
+
+import scala.util.Random
+
+trait TrainTestData {
+
+  protected def generateClassificationDataset(
+      numRows: Int,
+      numClass: Int,
+      seed: Int = 1): Seq[(Int, Float, Float, Float, Float)] = {
+    val random = new Random()
+    random.setSeed(seed)
+    (1 to numRows).map { _ =>
+      val label = random.nextInt(numClass)
+      // label, weight, c1, c2, c3
+      (label, random.nextFloat().abs, random.nextGaussian().toFloat, random.nextGaussian().toFloat,
+        random.nextGaussian().toFloat)
+    }
+  }
+
+  protected def generateRegressionDataset(
+      numRows: Int,
+      seed: Int = 11): Seq[(Float, Float, Float, Float, Float)] = {
+    val random = new Random()
+    random.setSeed(seed)
+    (1 to numRows).map { _ =>
+      // label, weight, c1, c2, c3
+      (random.nextFloat(), random.nextFloat().abs, random.nextGaussian().toFloat,
+        random.nextGaussian().toFloat,
+        random.nextGaussian().toFloat)
+    }
+  }
+
+  protected def generateRankDataset(
+      numRows: Int,
+      numClass: Int,
+      maxGroup: Int = 12,
+      seed: Int = 99): Seq[(Int, Float, Int, Float, Float, Float)] = {
+    val random = new Random()
+    random.setSeed(seed)
+    (1 to numRows).map { _ =>
+      val group = random.nextInt(maxGroup)
+      // label, weight, group, c1, c2, c3
+      (random.nextInt(numClass), group.toFloat, group,
+        random.nextGaussian().toFloat,
+        random.nextGaussian().toFloat,
+        random.nextGaussian().toFloat)
+    }
+  }
+}
+
+object Classification extends TrainTestData {
+  val train = generateClassificationDataset(300, 2, 3)
+  val test = generateClassificationDataset(150, 2, 5)
+}
+
+object MultiClassification extends TrainTestData {
+  val train = generateClassificationDataset(300, 4, 11)
+  val test = generateClassificationDataset(150, 4, 12)
+}
+
+object Regression extends TrainTestData {
+  val train = generateRegressionDataset(300, 222)
+  val test = generateRegressionDataset(150, 223)
+}
+
+object Ranking extends TrainTestData {
+  // Pass the seed by name: the third positional parameter of
+  // generateRankDataset is maxGroup, not seed.
+  val train = generateRankDataset(300, 10, seed = 555)
+  val test = generateRankDataset(150, 10, seed = 556)
+}
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XXXXXSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XXXXXSuite.scala
deleted file mode 100644
index f98c9614ab68..000000000000
--- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XXXXXSuite.scala
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- Copyright (c) 2021-2024 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */ - -package ml.dmlc.xgboost4j.scala.spark - -import org.scalatest.funsuite.AnyFunSuite - -import ml.dmlc.xgboost4j.scala.rapids.spark.GpuTestSuite - -class XXXXXSuite extends AnyFunSuite with GpuTestSuite { - - test("test Gpu XGBoostClassifierSuite") { - // Define the schema for the fake data - - withGpuSparkSession() { spark => - var df = spark.read.parquet("/home/bobwang/data/iris/parquet") - - df.sparkSession.conf.get("spark.rapids.sql.enabled") - - // Select the features and label columns - val labelCol = "class" - - val features = df.schema.names.filter(_ != labelCol) - - // df = df.withColumn("base_margin", lit(20)) - // .withColumn("weight", rand(1)) - - var Array(trainDf, validationDf) = df.randomSplit(Array(0.8, 0.2), seed = 1) - - // trainDf = trainDf.withColumn("validation", lit(false)) - // validationDf = validationDf.withColumn("validationDf", lit(true)) - - // df = trainDf.union(validationDf) - // - // // Assemble the feature columns into a single vector column - // val assembler = new VectorAssembler() - // .setInputCols(features) - // .setOutputCol("features") - // val dataset = assembler.transform(df) - - // val arrayInput = df.select(array(features.map(col(_)): _*).as("features"), - // col("label"), col("base_margin")) - - val est = new XGBoostClassifier() - .setNumWorkers(1) - .setNumRound(100) - // .setMaxDepth(3) - // .setWeightCol("weight") - // .setBaseMarginCol("base_margin") - .setFeaturesCol(features) - .setLabelCol(labelCol) - .setLeafPredictionCol("leaf") - .setContribPredictionCol("contrib") - .setDevice("cuda") - // .setEvalDataset(validationDf) - // .setValidationIndicatorCol("validation") - // .setPredictionCol("") - // .setRawPredictionCol("") - // .setProbabilityCol("xxxx") - // .setContribPredictionCol("contrb") - // .setLeafPredictionCol("leaf") - // val est = new XGBoostClassifier().setLabelCol(labelCol) - // est.fit(arrayInput) - // est.write.overwrite().save("/tmp/abcdef") - // val loadedEst = XGBoostClassifier.load("/tmp/abcdef") - // println(loadedEst.getNumRound) - // println(loadedEst.getMaxDepth) - - val model = est.fit(trainDf) - - val out = model.transform(df) - out.printSchema() - out.show(150) - // model.write.overwrite().save("/tmp/model/") - // val loadedModel = XGBoostClassificationModel.load("/tmp/model") - // println(loadedModel.getNumRound) - // println(loadedModel.getMaxDepth) - // model.transform(df).drop(features: _*).show(150, false) - } - - } -} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/Utils.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/Utils.scala index 6c9716089419..cae44ab9aef1 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/Utils.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/Utils.scala @@ -22,7 +22,7 @@ import org.json4s.{DefaultFormats, FullTypeHints, JField, JValue, NoTypeHints, T import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} -object Utils { +private[scala] object Utils { private[spark] implicit class XGBLabeledPointFeatures( val labeledPoint: XGBLabeledPoint @@ -111,4 +111,14 @@ object Utils { val TRAIN_NAME = "train" val VALIDATION_NAME = "eval" + + + /** Executes the provided code block and then closes the resource */ + def withResource[T <: AutoCloseable, V](r: T)(block: T => V): V = { + try { + block(r) + } finally { + r.close() + } + } } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala 
b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala index eebaae4306fd..49b50fcc469f 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala @@ -26,7 +26,7 @@ import org.scalatest.BeforeAndAfterEach import org.scalatest.funsuite.AnyFunSuite import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} -import ml.dmlc.xgboost4j.scala.spark.Utils.XGBLabeledPointFeatures +import ml.dmlc.xgboost4j.scala.spark.Utils.{withResource, XGBLabeledPointFeatures} trait PerTest extends BeforeAndAfterEach { self: AnyFunSuite => @@ -103,15 +103,6 @@ trait PerTest extends BeforeAndAfterEach { } } - /** Executes the provided code block and then closes the resource */ - protected def withResource[T <: AutoCloseable, V](r: T)(block: T => V): V = { - try { - block(r) - } finally { - r.close() - } - } - def smallBinaryClassificationVector: DataFrame = ss.createDataFrame(sc.parallelize(Seq( (1.0, 0.5, 1.0, Vectors.dense(1.0, 2.0, 3.0)), (0.0, 0.4, -3.0, Vectors.dense(0.0, 0.0, 0.0)), diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala index 32f3ee1bd568..dcd22009514e 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala @@ -112,6 +112,13 @@ class XGBoostClassifierSuite extends AnyFunSuite with PerTest with TmpFolderPerS assert(out.schema.names.contains("leaf")) assert(out.schema.names.contains("contrib")) + + val out1 = classifier.setLeafPredictionCol("leaf1") + .setContribPredictionCol("contrib1") + .train(trainDf).transform(trainDf) + + assert(out1.schema.names.contains("leaf1")) + assert(out1.schema.names.contains("contrib1")) } test("Supported objectives") { diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala index 4a8b59741e63..614e93c8e8cf 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimatorSuite.scala @@ -32,408 +32,422 @@ import ml.dmlc.xgboost4j.scala.spark.Utils.TRAIN_NAME class XGBoostEstimatorSuite extends AnyFunSuite with PerTest with TmpFolderPerSuite { test("params") { + val df = smallBinaryClassificationVector val xgbParams: Map[String, Any] = Map( "max_depth" -> 5, "eta" -> 0.2, "objective" -> "binary:logistic" ) - val estimator = new XGBoostRegressor(xgbParams) - .setFeaturesCol("abc") + val estimator = new XGBoostClassifier(xgbParams) + .setFeaturesCol("features") .setMissing(0.2f) .setAlpha(0.97) + .setLeafPredictionCol("leaf") + .setContribPredictionCol("contrib") + .setNumRound(1) assert(estimator.getMaxDepth === 5) assert(estimator.getEta === 0.2) assert(estimator.getObjective === "binary:logistic") - assert(estimator.getFeaturesCol === "abc") + assert(estimator.getFeaturesCol === "features") assert(estimator.getMissing === 0.2f) assert(estimator.getAlpha === 0.97) estimator.setEta(0.66).setMaxDepth(7) assert(estimator.getMaxDepth === 7) assert(estimator.getEta === 
0.66) + + val model = estimator.train(df) + assert(model.getMaxDepth === 7) + assert(model.getEta === 0.66) + assert(model.getObjective === "binary:logistic") + assert(model.getFeaturesCol === "features") + assert(model.getMissing === 0.2f) + assert(model.getAlpha === 0.97) + assert(model.getLeafPredictionCol === "leaf") + assert(model.getContribPredictionCol === "contrib") } -test("nthread") { - val classifier = new XGBoostClassifier().setNthread(100) + test("nthread") { + val classifier = new XGBoostClassifier().setNthread(100) - intercept[IllegalArgumentException]( - classifier.validate(smallBinaryClassificationVector) - ) -} + intercept[IllegalArgumentException]( + classifier.validate(smallBinaryClassificationVector) + ) + } -test("RuntimeParameter") { - var runtimeParams = new XGBoostClassifier( - Map("device" -> "cpu")) - .getRuntimeParameters(true) - assert(!runtimeParams.runOnGpu) - - runtimeParams = new XGBoostClassifier( - Map("device" -> "cuda")).setNumWorkers(1).setNumRound(1) - .getRuntimeParameters(true) - assert(runtimeParams.runOnGpu) - - runtimeParams = new XGBoostClassifier( - Map("device" -> "cpu", "tree_method" -> "gpu_hist")).setNumWorkers(1).setNumRound(1) - .getRuntimeParameters(true) - assert(runtimeParams.runOnGpu) - - runtimeParams = new XGBoostClassifier( - Map("device" -> "cuda", "tree_method" -> "gpu_hist")).setNumWorkers(1).setNumRound(1) - .getRuntimeParameters(true) - assert(runtimeParams.runOnGpu) -} + test("RuntimeParameter") { + var runtimeParams = new XGBoostClassifier( + Map("device" -> "cpu")) + .getRuntimeParameters(true) + assert(!runtimeParams.runOnGpu) + + runtimeParams = new XGBoostClassifier( + Map("device" -> "cuda")).setNumWorkers(1).setNumRound(1) + .getRuntimeParameters(true) + assert(runtimeParams.runOnGpu) + + runtimeParams = new XGBoostClassifier( + Map("device" -> "cpu", "tree_method" -> "gpu_hist")).setNumWorkers(1).setNumRound(1) + .getRuntimeParameters(true) + assert(runtimeParams.runOnGpu) + + runtimeParams = new XGBoostClassifier( + Map("device" -> "cuda", "tree_method" -> "gpu_hist")).setNumWorkers(1).setNumRound(1) + .getRuntimeParameters(true) + assert(runtimeParams.runOnGpu) + } -test("test persistence of XGBoostClassifier and XGBoostClassificationModel " + - "using custom Eval and Obj") { - val trainingDF = buildDataFrame(Classification.train) - val testDM = new DMatrix(Classification.test.iterator) - - val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", - "verbosity" -> "1", "objective" -> "binary:logistic") - - val xgbc = new XGBoostClassifier(paramMap) - .setCustomObj(new CustomObj(1)) - .setCustomEval(new EvalError) - .setNumRound(10) - .setNumWorkers(numWorkers) - - val xgbcPath = new File(tempDir.toFile, "xgbc").getPath - xgbc.write.overwrite().save(xgbcPath) - val xgbc2 = XGBoostClassifier.load(xgbcPath) - - assert(xgbc.getCustomObj.asInstanceOf[CustomObj].customParameter === 1) - assert(xgbc2.getCustomObj.asInstanceOf[CustomObj].customParameter === 1) - - val eval = new EvalError() - - val model = xgbc.fit(trainingDF) - val evalResults = eval.eval(model.nativeBooster.predict(testDM, outPutMargin = true), testDM) - assert(evalResults < 0.1) - val xgbcModelPath = new File(tempDir.toFile, "xgbcModel").getPath - model.write.overwrite.save(xgbcModelPath) - val model2 = XGBoostClassificationModel.load(xgbcModelPath) - assert(Arrays.equals(model.nativeBooster.toByteArray, model2.nativeBooster.toByteArray)) - - assert(model.getEta === model2.getEta) - assert(model.getNumRound === model2.getNumRound) - 
assert(model.getRawPredictionCol === model2.getRawPredictionCol) - val evalResults2 = eval.eval(model2.nativeBooster.predict(testDM, outPutMargin = true), testDM) - assert(evalResults === evalResults2) -} + test("test persistence of XGBoostClassifier and XGBoostClassificationModel " + + "using custom Eval and Obj") { + val trainingDF = buildDataFrame(Classification.train) + val testDM = new DMatrix(Classification.test.iterator) -test("Check for Spark encryption over-the-wire") { - val originalSslConfOpt = ss.conf.getOption("spark.ssl.enabled") - ss.conf.set("spark.ssl.enabled", true) + val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", + "verbosity" -> "1", "objective" -> "binary:logistic") - val paramMap = Map("eta" -> "1", "max_depth" -> "2", "verbosity" -> "1", - "objective" -> "binary:logistic") - val training = smallBinaryClassificationVector + val xgbc = new XGBoostClassifier(paramMap) + .setCustomObj(new CustomObj(1)) + .setCustomEval(new EvalError) + .setNumRound(10) + .setNumWorkers(numWorkers) - withClue("xgboost-spark should throw an exception when spark.ssl.enabled = true but " + - "xgboost.spark.ignoreSsl != true") { - val thrown = intercept[Exception] { - new XGBoostClassifier(paramMap).setNumRound(2).setNumWorkers(numWorkers).fit(training) - } - assert(thrown.getMessage.contains("xgboost.spark.ignoreSsl") && - thrown.getMessage.contains("spark.ssl.enabled")) + val xgbcPath = new File(tempDir.toFile, "xgbc").getPath + xgbc.write.overwrite().save(xgbcPath) + val xgbc2 = XGBoostClassifier.load(xgbcPath) + + assert(xgbc.getCustomObj.asInstanceOf[CustomObj].customParameter === 1) + assert(xgbc2.getCustomObj.asInstanceOf[CustomObj].customParameter === 1) + + val eval = new EvalError() + + val model = xgbc.fit(trainingDF) + val evalResults = eval.eval(model.nativeBooster.predict(testDM, outPutMargin = true), testDM) + assert(evalResults < 0.1) + val xgbcModelPath = new File(tempDir.toFile, "xgbcModel").getPath + model.write.overwrite.save(xgbcModelPath) + val model2 = XGBoostClassificationModel.load(xgbcModelPath) + assert(Arrays.equals(model.nativeBooster.toByteArray, model2.nativeBooster.toByteArray)) + + assert(model.getEta === model2.getEta) + assert(model.getNumRound === model2.getNumRound) + assert(model.getRawPredictionCol === model2.getRawPredictionCol) + val evalResults2 = eval.eval(model2.nativeBooster.predict(testDM, outPutMargin = true), testDM) + assert(evalResults === evalResults2) } - // Confirm that this check can be overridden. - ss.conf.set("xgboost.spark.ignoreSsl", true) - new XGBoostClassifier(paramMap).setNumRound(2).setNumWorkers(numWorkers).fit(training) + test("Check for Spark encryption over-the-wire") { + val originalSslConfOpt = ss.conf.getOption("spark.ssl.enabled") + ss.conf.set("spark.ssl.enabled", true) + + val paramMap = Map("eta" -> "1", "max_depth" -> "2", "verbosity" -> "1", + "objective" -> "binary:logistic") + val training = smallBinaryClassificationVector + + withClue("xgboost-spark should throw an exception when spark.ssl.enabled = true but " + + "xgboost.spark.ignoreSsl != true") { + val thrown = intercept[Exception] { + new XGBoostClassifier(paramMap).setNumRound(2).setNumWorkers(numWorkers).fit(training) + } + assert(thrown.getMessage.contains("xgboost.spark.ignoreSsl") && + thrown.getMessage.contains("spark.ssl.enabled")) + } - originalSslConfOpt match { - case None => - ss.conf.unset("spark.ssl.enabled") - case Some(originalSslConf) => - ss.conf.set("spark.ssl.enabled", originalSslConf) + // Confirm that this check can be overridden. 
+ ss.conf.set("xgboost.spark.ignoreSsl", true) + new XGBoostClassifier(paramMap).setNumRound(2).setNumWorkers(numWorkers).fit(training) + + originalSslConfOpt match { + case None => + ss.conf.unset("spark.ssl.enabled") + case Some(originalSslConf) => + ss.conf.set("spark.ssl.enabled", originalSslConf) + } + ss.conf.unset("xgboost.spark.ignoreSsl") } - ss.conf.unset("xgboost.spark.ignoreSsl") -} -test("nthread configuration must be no larger than spark.task.cpus") { - val training = smallBinaryClassificationVector - val paramMap = Map("eta" -> "1", "max_depth" -> "2", "verbosity" -> "1", - "objective" -> "binary:logistic") - intercept[IllegalArgumentException] { - new XGBoostClassifier(paramMap) - .setNumWorkers(numWorkers) - .setNumRound(2) - .setNthread(sc.getConf.getInt("spark.task.cpus", 1) + 1) - .fit(training) + test("nthread configuration must be no larger than spark.task.cpus") { + val training = smallBinaryClassificationVector + val paramMap = Map("eta" -> "1", "max_depth" -> "2", "verbosity" -> "1", + "objective" -> "binary:logistic") + intercept[IllegalArgumentException] { + new XGBoostClassifier(paramMap) + .setNumWorkers(numWorkers) + .setNumRound(2) + .setNthread(sc.getConf.getInt("spark.task.cpus", 1) + 1) + .fit(training) + } } -} -test("preprocess dataset") { - val dataset = ss.createDataFrame(sc.parallelize(Seq( - (1.0, 0, 0.5, 1.0, Vectors.dense(1.0, 2.0, 3.0), "a"), - (0.0, 2, -0.5, 0.0, Vectors.dense(0.2, 1.2, 2.0), "b"), - (2.0, 2, -0.4, -2.1, Vectors.dense(0.5, 2.2, 1.7), "c") - ))).toDF("label", "group", "margin", "weight", "features", "other") - - val classifier = new XGBoostClassifier() - .setLabelCol("label") - .setFeaturesCol("features") - .setBaseMarginCol("margin") - .setWeightCol("weight") - - val (df, indices) = classifier.preprocess(dataset) - var schema = df.schema - assert(!schema.names.contains("group") && !schema.names.contains("other")) - assert(indices.labelId == schema.fieldIndex("label") && - indices.groupId.isEmpty && - indices.marginId.get == schema.fieldIndex("margin") && - indices.weightId.get == schema.fieldIndex("weight") && - indices.featureId.get == schema.fieldIndex("features") && - indices.featureIds.isEmpty) - - classifier.setWeightCol("") - val (df1, indices1) = classifier.preprocess(dataset) - schema = df1.schema - Seq("weight", "group", "other").foreach(v => assert(!schema.names.contains(v))) - assert(indices1.labelId == schema.fieldIndex("label") && - indices1.groupId.isEmpty && - indices1.marginId.get == schema.fieldIndex("margin") && - indices1.weightId.isEmpty && - indices1.featureId.get == schema.fieldIndex("features") && - indices1.featureIds.isEmpty) -} + test("preprocess dataset") { + val dataset = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 0, 0.5, 1.0, Vectors.dense(1.0, 2.0, 3.0), "a"), + (0.0, 2, -0.5, 0.0, Vectors.dense(0.2, 1.2, 2.0), "b"), + (2.0, 2, -0.4, -2.1, Vectors.dense(0.5, 2.2, 1.7), "c") + ))).toDF("label", "group", "margin", "weight", "features", "other") + + val classifier = new XGBoostClassifier() + .setLabelCol("label") + .setFeaturesCol("features") + .setBaseMarginCol("margin") + .setWeightCol("weight") + + val (df, indices) = classifier.preprocess(dataset) + var schema = df.schema + assert(!schema.names.contains("group") && !schema.names.contains("other")) + assert(indices.labelId == schema.fieldIndex("label") && + indices.groupId.isEmpty && + indices.marginId.get == schema.fieldIndex("margin") && + indices.weightId.get == schema.fieldIndex("weight") && + indices.featureId.get == 
schema.fieldIndex("features") && + indices.featureIds.isEmpty) + + classifier.setWeightCol("") + val (df1, indices1) = classifier.preprocess(dataset) + schema = df1.schema + Seq("weight", "group", "other").foreach(v => assert(!schema.names.contains(v))) + assert(indices1.labelId == schema.fieldIndex("label") && + indices1.groupId.isEmpty && + indices1.marginId.get == schema.fieldIndex("margin") && + indices1.weightId.isEmpty && + indices1.featureId.get == schema.fieldIndex("features") && + indices1.featureIds.isEmpty) + } -test("to XGBoostLabeledPoint RDD") { - val data = Array( - Array(1.0, 2.0, 3.0, 4.0, 5.0), - Array(0.0, 0.0, 0.0, 0.0, 2.0), - Array(12.0, 13.0, 14.0, 14.0, 15.0), - Array(20.5, 21.2, 0.0, 0.0, 2.0) - ) - val dataset = ss.createDataFrame(sc.parallelize(Seq( - (1.0, 0, 0.5, 1.0, Vectors.dense(data(0)), "a"), - (2.0, 2, -0.5, 0.0, Vectors.dense(data(1)).toSparse, "b"), - (3.0, 2, -0.5, 0.0, Vectors.dense(data(2)), "b"), - (4.0, 2, -0.4, -2.1, Vectors.dense(data(3)), "c") - ))).toDF("label", "group", "margin", "weight", "features", "other") - - val classifier = new XGBoostClassifier() - .setLabelCol("label") - .setFeaturesCol("features") - .setWeightCol("weight") - .setNumWorkers(2) - - val (df, indices) = classifier.preprocess(dataset) - val rdd = classifier.toXGBLabeledPoint(df, indices) - val result = rdd.collect().sortBy(x => x.label) - - assert(result.length == data.length) - - def toArray(index: Int): Array[Float] = { - val labelPoint = result(index) - if (labelPoint.indices != null) { - Vectors.sparse(labelPoint.size, - labelPoint.indices, - labelPoint.values.map(_.toDouble)).toArray.map(_.toFloat) - } else { - labelPoint.values + test("to XGBoostLabeledPoint RDD") { + val data = Array( + Array(1.0, 2.0, 3.0, 4.0, 5.0), + Array(0.0, 0.0, 0.0, 0.0, 2.0), + Array(12.0, 13.0, 14.0, 14.0, 15.0), + Array(20.5, 21.2, 0.0, 0.0, 2.0) + ) + val dataset = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 0, 0.5, 1.0, Vectors.dense(data(0)), "a"), + (2.0, 2, -0.5, 0.0, Vectors.dense(data(1)).toSparse, "b"), + (3.0, 2, -0.5, 0.0, Vectors.dense(data(2)), "b"), + (4.0, 2, -0.4, -2.1, Vectors.dense(data(3)), "c") + ))).toDF("label", "group", "margin", "weight", "features", "other") + + val classifier = new XGBoostClassifier() + .setLabelCol("label") + .setFeaturesCol("features") + .setWeightCol("weight") + .setNumWorkers(2) + + val (df, indices) = classifier.preprocess(dataset) + val rdd = classifier.toXGBLabeledPoint(df, indices) + val result = rdd.collect().sortBy(x => x.label) + + assert(result.length == data.length) + + def toArray(index: Int): Array[Float] = { + val labelPoint = result(index) + if (labelPoint.indices != null) { + Vectors.sparse(labelPoint.size, + labelPoint.indices, + labelPoint.values.map(_.toDouble)).toArray.map(_.toFloat) + } else { + labelPoint.values + } } - } - assert(result(0).label === 1.0f && result(0).baseMargin.isNaN && - result(0).weight === 1.0f && toArray(0) === data(0).map(_.toFloat)) - assert(result(1).label == 2.0f && result(1).baseMargin.isNaN && - result(1).weight === 0.0f && toArray(1) === data(1).map(_.toFloat)) - assert(result(2).label === 3.0f && result(2).baseMargin.isNaN && - result(2).weight == 0.0f && toArray(2) === data(2).map(_.toFloat)) - assert(result(3).label === 4.0f && result(3).baseMargin.isNaN && - result(3).weight === -2.1f && toArray(3) === data(3).map(_.toFloat)) -} + assert(result(0).label === 1.0f && result(0).baseMargin.isNaN && + result(0).weight === 1.0f && toArray(0) === data(0).map(_.toFloat)) + assert(result(1).label 
== 2.0f && result(1).baseMargin.isNaN && + result(1).weight === 0.0f && toArray(1) === data(1).map(_.toFloat)) + assert(result(2).label === 3.0f && result(2).baseMargin.isNaN && + result(2).weight == 0.0f && toArray(2) === data(2).map(_.toFloat)) + assert(result(3).label === 4.0f && result(3).baseMargin.isNaN && + result(3).weight === -2.1f && toArray(3) === data(3).map(_.toFloat)) + } -Seq((Float.NaN, 2), (0.0f, 7 + 2), (15.0f, 1 + 2), (10101011.0f, 0 + 2)).foreach { - case (missing, expectedMissingValue) => - test(s"to RDD watches with missing $missing") { - val data = Array( - Array(1.0, 2.0, 3.0, 4.0, 5.0), - Array(1.0, Float.NaN, 0.0, 0.0, 2.0), - Array(12.0, 13.0, Float.NaN, 14.0, 15.0), - Array(0.0, 0.0, 0.0, 0.0, 0.0) - ) - val dataset = ss.createDataFrame(sc.parallelize(Seq( - (1.0, 0, 0.5, 1.0, Vectors.dense(data(0)), "a"), - (2.0, 2, -0.5, 0.0, Vectors.dense(data(1)).toSparse, "b"), - (3.0, 3, -0.5, 0.0, Vectors.dense(data(2)), "b"), - (4.0, 4, -0.4, -2.1, Vectors.dense(data(3)), "c") - ))).toDF("label", "group", "margin", "weight", "features", "other") - - val classifier = new XGBoostClassifier() - .setLabelCol("label") - .setFeaturesCol("features") - .setWeightCol("weight") - .setBaseMarginCol("margin") - .setMissing(missing) - .setNumWorkers(2) - - val (df, indices) = classifier.preprocess(dataset) - val rdd = classifier.toRdd(df, indices) - val result = rdd.mapPartitions { iter => - if (iter.hasNext) { - val watches = iter.next() - val size = watches.size - val trainDM = watches.toMap(TRAIN_NAME) - val rowNum = trainDM.rowNum - val labels = trainDM.getLabel - val weight = trainDM.getWeight - val margins = trainDM.getBaseMargin - val nonMissing = trainDM.nonMissingNum - watches.delete() - Iterator.single((size, rowNum, labels, weight, margins, nonMissing)) - } else { - Iterator.empty + Seq((Float.NaN, 2), (0.0f, 7 + 2), (15.0f, 1 + 2), (10101011.0f, 0 + 2)).foreach { + case (missing, expectedMissingValue) => + test(s"to RDD watches with missing $missing") { + val data = Array( + Array(1.0, 2.0, 3.0, 4.0, 5.0), + Array(1.0, Float.NaN, 0.0, 0.0, 2.0), + Array(12.0, 13.0, Float.NaN, 14.0, 15.0), + Array(0.0, 0.0, 0.0, 0.0, 0.0) + ) + val dataset = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 0, 0.5, 1.0, Vectors.dense(data(0)), "a"), + (2.0, 2, -0.5, 0.0, Vectors.dense(data(1)).toSparse, "b"), + (3.0, 3, -0.5, 0.0, Vectors.dense(data(2)), "b"), + (4.0, 4, -0.4, -2.1, Vectors.dense(data(3)), "c") + ))).toDF("label", "group", "margin", "weight", "features", "other") + + val classifier = new XGBoostClassifier() + .setLabelCol("label") + .setFeaturesCol("features") + .setWeightCol("weight") + .setBaseMarginCol("margin") + .setMissing(missing) + .setNumWorkers(2) + + val (df, indices) = classifier.preprocess(dataset) + val rdd = classifier.toRdd(df, indices) + val result = rdd.mapPartitions { iter => + if (iter.hasNext) { + val watches = iter.next() + val size = watches.size + val trainDM = watches.toMap(TRAIN_NAME) + val rowNum = trainDM.rowNum + val labels = trainDM.getLabel + val weight = trainDM.getWeight + val margins = trainDM.getBaseMargin + val nonMissing = trainDM.nonMissingNum + watches.delete() + Iterator.single((size, rowNum, labels, weight, margins, nonMissing)) + } else { + Iterator.empty + } + }.collect() + + val labels: ArrayBuffer[Float] = ArrayBuffer.empty + val weight: ArrayBuffer[Float] = ArrayBuffer.empty + val margins: ArrayBuffer[Float] = ArrayBuffer.empty + var nonMissingValues = 0L + var totalRows = 0L + + for (row <- result) { + assert(row._1 === 1) + 
totalRows = totalRows + row._2 + labels.append(row._3: _*) + weight.append(row._4: _*) + margins.append(row._5: _*) + nonMissingValues = nonMissingValues + row._6 } - }.collect() - - val labels: ArrayBuffer[Float] = ArrayBuffer.empty - val weight: ArrayBuffer[Float] = ArrayBuffer.empty - val margins: ArrayBuffer[Float] = ArrayBuffer.empty - var nonMissingValues = 0L - var totalRows = 0L - - for (row <- result) { - assert(row._1 === 1) - totalRows = totalRows + row._2 - labels.append(row._3: _*) - weight.append(row._4: _*) - margins.append(row._5: _*) - nonMissingValues = nonMissingValues + row._6 + assert(totalRows === 4) + assert(nonMissingValues === data.size * data(0).length - expectedMissingValue) + assert(labels.toArray.sorted === Array(1.0f, 2.0f, 3.0f, 4.0f).sorted) + assert(weight.toArray.sorted === Array(0.0f, 0.0f, 1.0f, -2.1f).sorted) + assert(margins.toArray.sorted === Array(-0.5f, -0.5f, -0.4f, 0.5f).sorted) } - assert(totalRows === 4) - assert(nonMissingValues === data.size * data(0).length - expectedMissingValue) - assert(labels.toArray.sorted === Array(1.0f, 2.0f, 3.0f, 4.0f).sorted) - assert(weight.toArray.sorted === Array(0.0f, 0.0f, 1.0f, -2.1f).sorted) - assert(margins.toArray.sorted === Array(-0.5f, -0.5f, -0.4f, 0.5f).sorted) - } -} + } -test("to RDD watches with eval") { - val trainData = Array( - Array(-1.0, -2.0, -3.0, -4.0, -5.0), - Array(2.0, 2.0, 2.0, 3.0, -2.0), - Array(-12.0, -13.0, -14.0, -14.0, -15.0), - Array(-20.5, -21.2, 0.0, 0.0, 2.0) - ) - val trainDataset = ss.createDataFrame(sc.parallelize(Seq( - (11.0, 0, 0.15, 11.0, Vectors.dense(trainData(0)), "a"), - (12.0, 12, -0.15, 10.0, Vectors.dense(trainData(1)).toSparse, "b"), - (13.0, 12, -0.15, 10.0, Vectors.dense(trainData(2)), "b"), - (14.0, 12, -0.14, -12.1, Vectors.dense(trainData(3)), "c") - ))).toDF("label", "group", "margin", "weight", "features", "other") - val evalData = Array( - Array(1.0, 2.0, 3.0, 4.0, 5.0), - Array(0.0, 0.0, 0.0, 0.0, 2.0), - Array(12.0, 13.0, 14.0, 14.0, 15.0), - Array(20.5, 21.2, 0.0, 0.0, 2.0) - ) - val evalDataset = ss.createDataFrame(sc.parallelize(Seq( - (1.0, 0, 0.5, 1.0, Vectors.dense(evalData(0)), "a"), - (2.0, 2, -0.5, 0.0, Vectors.dense(evalData(1)).toSparse, "b"), - (3.0, 2, -0.5, 0.0, Vectors.dense(evalData(2)), "b"), - (4.0, 2, -0.4, -2.1, Vectors.dense(evalData(3)), "c") - ))).toDF("label", "group", "margin", "weight", "features", "other") - - val classifier = new XGBoostClassifier() - .setLabelCol("label") - .setFeaturesCol("features") - .setWeightCol("weight") - .setBaseMarginCol("margin") - .setEvalDataset(evalDataset) - .setNumWorkers(2) - - val (df, indices) = classifier.preprocess(trainDataset) - val rdd = classifier.toRdd(df, indices) - val result = rdd.mapPartitions { iter => - if (iter.hasNext) { - val watches = iter.next() - val size = watches.size - val evalDM = watches.toMap(Utils.VALIDATION_NAME) - val rowNum = evalDM.rowNum - val labels = evalDM.getLabel - val weight = evalDM.getWeight - val margins = evalDM.getBaseMargin - watches.delete() - Iterator.single((size, rowNum, labels, weight, margins)) - } else { - Iterator.empty + test("to RDD watches with eval") { + val trainData = Array( + Array(-1.0, -2.0, -3.0, -4.0, -5.0), + Array(2.0, 2.0, 2.0, 3.0, -2.0), + Array(-12.0, -13.0, -14.0, -14.0, -15.0), + Array(-20.5, -21.2, 0.0, 0.0, 2.0) + ) + val trainDataset = ss.createDataFrame(sc.parallelize(Seq( + (11.0, 0, 0.15, 11.0, Vectors.dense(trainData(0)), "a"), + (12.0, 12, -0.15, 10.0, Vectors.dense(trainData(1)).toSparse, "b"), + (13.0, 12, -0.15, 
10.0, Vectors.dense(trainData(2)), "b"), + (14.0, 12, -0.14, -12.1, Vectors.dense(trainData(3)), "c") + ))).toDF("label", "group", "margin", "weight", "features", "other") + val evalData = Array( + Array(1.0, 2.0, 3.0, 4.0, 5.0), + Array(0.0, 0.0, 0.0, 0.0, 2.0), + Array(12.0, 13.0, 14.0, 14.0, 15.0), + Array(20.5, 21.2, 0.0, 0.0, 2.0) + ) + val evalDataset = ss.createDataFrame(sc.parallelize(Seq( + (1.0, 0, 0.5, 1.0, Vectors.dense(evalData(0)), "a"), + (2.0, 2, -0.5, 0.0, Vectors.dense(evalData(1)).toSparse, "b"), + (3.0, 2, -0.5, 0.0, Vectors.dense(evalData(2)), "b"), + (4.0, 2, -0.4, -2.1, Vectors.dense(evalData(3)), "c") + ))).toDF("label", "group", "margin", "weight", "features", "other") + + val classifier = new XGBoostClassifier() + .setLabelCol("label") + .setFeaturesCol("features") + .setWeightCol("weight") + .setBaseMarginCol("margin") + .setEvalDataset(evalDataset) + .setNumWorkers(2) + + val (df, indices) = classifier.preprocess(trainDataset) + val rdd = classifier.toRdd(df, indices) + val result = rdd.mapPartitions { iter => + if (iter.hasNext) { + val watches = iter.next() + val size = watches.size + val evalDM = watches.toMap(Utils.VALIDATION_NAME) + val rowNum = evalDM.rowNum + val labels = evalDM.getLabel + val weight = evalDM.getWeight + val margins = evalDM.getBaseMargin + watches.delete() + Iterator.single((size, rowNum, labels, weight, margins)) + } else { + Iterator.empty + } + }.collect() + + val labels: ArrayBuffer[Float] = ArrayBuffer.empty + val weight: ArrayBuffer[Float] = ArrayBuffer.empty + val margins: ArrayBuffer[Float] = ArrayBuffer.empty + + var totalRows = 0L + for (row <- result) { + assert(row._1 === 2) + totalRows = totalRows + row._2 + labels.append(row._3: _*) + weight.append(row._4: _*) + margins.append(row._5: _*) } - }.collect() - - val labels: ArrayBuffer[Float] = ArrayBuffer.empty - val weight: ArrayBuffer[Float] = ArrayBuffer.empty - val margins: ArrayBuffer[Float] = ArrayBuffer.empty - - var totalRows = 0L - for (row <- result) { - assert(row._1 === 2) - totalRows = totalRows + row._2 - labels.append(row._3: _*) - weight.append(row._4: _*) - margins.append(row._5: _*) + assert(totalRows === 4) + assert(labels.toArray.sorted === Array(1.0f, 2.0f, 3.0f, 4.0f).sorted) + assert(weight.toArray.sorted === Array(0.0f, 0.0f, 1.0f, -2.1f).sorted) + assert(margins.toArray.sorted === Array(-0.5f, -0.5f, -0.4f, 0.5f).sorted) } - assert(totalRows === 4) - assert(labels.toArray.sorted === Array(1.0f, 2.0f, 3.0f, 4.0f).sorted) - assert(weight.toArray.sorted === Array(0.0f, 0.0f, 1.0f, -2.1f).sorted) - assert(margins.toArray.sorted === Array(-0.5f, -0.5f, -0.4f, 0.5f).sorted) -} -test("XGBoost-Spark model format should match xgboost4j") { - val trainingDF = buildDataFrame(MultiClassification.train) - - Seq(new XGBoostClassifier()).foreach { est => - est.setNumRound(5) - val model = est.fit(trainingDF) - - // test json - val modelPath = new File(tempDir.toFile, "xgbc").getPath - model.write.overwrite().option("format", "json").save(modelPath) - val nativeJsonModelPath = new File(tempDir.toFile, "nativeModel.json").getPath - model.nativeBooster.saveModel(nativeJsonModelPath) - assert(compareTwoFiles(new File(modelPath, "data/model").getPath, - nativeJsonModelPath)) - - // test ubj - val modelUbjPath = new File(tempDir.toFile, "xgbcUbj").getPath - model.write.overwrite().save(modelUbjPath) - val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel.ubj").getPath - model.nativeBooster.saveModel(nativeUbjModelPath) - assert(compareTwoFiles(new 
File(modelUbjPath, "data/model").getPath,
-    nativeUbjModelPath))
-
-  // json file should be indifferent with ubj file
-  val modelJsonPath = new File(tempDir.toFile, "xgbcJson").getPath
-  model.write.overwrite().option("format", "json").save(modelJsonPath)
-  val nativeUbjModelPath1 = new File(tempDir.toFile, "nativeModel1.ubj").getPath
-  model.nativeBooster.saveModel(nativeUbjModelPath1)
-  assert(!compareTwoFiles(new File(modelJsonPath, "data/model").getPath,
-    nativeUbjModelPath1))
+  test("XGBoost-Spark model format should match xgboost4j") {
+    val trainingDF = buildDataFrame(MultiClassification.train)
+
+    Seq(new XGBoostClassifier()).foreach { est =>
+      est.setNumRound(5)
+      val model = est.fit(trainingDF)
+
+      // test json
+      val modelPath = new File(tempDir.toFile, "xgbc").getPath
+      model.write.overwrite().option("format", "json").save(modelPath)
+      val nativeJsonModelPath = new File(tempDir.toFile, "nativeModel.json").getPath
+      model.nativeBooster.saveModel(nativeJsonModelPath)
+      assert(compareTwoFiles(new File(modelPath, "data/model").getPath,
+        nativeJsonModelPath))
+
+      // test ubj
+      val modelUbjPath = new File(tempDir.toFile, "xgbcUbj").getPath
+      model.write.overwrite().save(modelUbjPath)
+      val nativeUbjModelPath = new File(tempDir.toFile, "nativeModel.ubj").getPath
+      model.nativeBooster.saveModel(nativeUbjModelPath)
+      assert(compareTwoFiles(new File(modelUbjPath, "data/model").getPath,
+        nativeUbjModelPath))
+
+      // the json file should differ from the ubj file
+      val modelJsonPath = new File(tempDir.toFile, "xgbcJson").getPath
+      model.write.overwrite().option("format", "json").save(modelJsonPath)
+      val nativeUbjModelPath1 = new File(tempDir.toFile, "nativeModel1.ubj").getPath
+      model.nativeBooster.saveModel(nativeUbjModelPath1)
+      assert(!compareTwoFiles(new File(modelJsonPath, "data/model").getPath,
+        nativeUbjModelPath1))
+    }
+  }
 
-test("native json model file should store feature_name and feature_type") {
-  val featureNames = (1 to 33).map(idx => s"feature_${idx}").toArray
-  val featureTypes = (1 to 33).map(idx => "q").toArray
-  val trainingDF = buildDataFrame(MultiClassification.train)
-  val xgb = new XGBoostClassifier()
-    .setNumWorkers(numWorkers)
-    .setFeatureNames(featureNames)
-    .setFeatureTypes(featureTypes)
-    .setNumRound(2)
-  val model = xgb.fit(trainingDF)
-  val modelStr = new String(model.nativeBooster.toByteArray("json"))
-  val jsonModel = parseJson(modelStr)
-  implicit val formats: Formats = DefaultFormats
-  val featureNamesInModel = (jsonModel \ "learner" \ "feature_names").extract[List[String]]
-  val featureTypesInModel = (jsonModel \ "learner" \ "feature_types").extract[List[String]]
-  assert(featureNamesInModel.length == 33)
-  assert(featureTypesInModel.length == 33)
-  assert(featureNames sameElements featureNamesInModel)
-  assert(featureTypes sameElements featureTypesInModel)
-}
+  test("native json model file should store feature_name and feature_type") {
+    val featureNames = (1 to 33).map(idx => s"feature_${idx}").toArray
+    val featureTypes = (1 to 33).map(idx => "q").toArray
+    val trainingDF = buildDataFrame(MultiClassification.train)
+    val xgb = new XGBoostClassifier()
+      .setNumWorkers(numWorkers)
+      .setFeatureNames(featureNames)
+      .setFeatureTypes(featureTypes)
+      .setNumRound(2)
+    val model = xgb.fit(trainingDF)
+    val modelStr = new String(model.nativeBooster.toByteArray("json"))
+    val jsonModel = parseJson(modelStr)
+    implicit val formats: Formats = DefaultFormats
+    val featureNamesInModel = (jsonModel \ "learner" \ "feature_names").extract[List[String]]
+    val featureTypesInModel = (jsonModel \ "learner" \ "feature_types").extract[List[String]]
+    assert(featureNamesInModel.length == 33)
+    assert(featureTypesInModel.length == 33)
+    assert(featureNames sameElements featureNamesInModel)
+    assert(featureTypes sameElements featureTypesInModel)
+  }
 }
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java
index 7bb8279c12b4..3fa3c692fcb5 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/DMatrix.java
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014-2023 by Contributors
+ Copyright (c) 2014-2024 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -28,18 +28,10 @@ public class DMatrix {
 
   protected long handle = 0;
 
-  /**
-   * sparse matrix type (CSR or CSC)
-   */
-  public static enum SparseType {
-    CSR,
-    CSC;
-  }
-
   /**
    * Create DMatrix from iterator.
    *
-   * @param iter The data iterator of mini batch to provide the data.
+   * @param iter      The data iterator of mini batch to provide the data.
    * @param cacheInfo Cache path information, used for external memory setting, can be null.
    * @throws XGBoostError
    */
@@ -50,9 +42,9 @@ public DMatrix(Iterator<LabeledPoint> iter, String cacheInfo) throws XGBoostErro
   /**
    * Create DMatrix from iterator.
    *
-   * @param iter The data iterator of mini batch to provide the data.
+   * @param iter      The data iterator of mini batch to provide the data.
    * @param cacheInfo Cache path information, used for external memory setting, can be null.
-   * @param missing the missing value
+   * @param missing   the missing value
    * @throws XGBoostError
    */
   public DMatrix(Iterator<LabeledPoint> iter,
@@ -87,10 +79,11 @@ public DMatrix(String dataPath) throws XGBoostError {
 
   /**
    * Create DMatrix from Sparse matrix in CSR/CSC format.
+   *
    * @param headers The row index of the matrix.
    * @param indices The indices of presenting entries.
-   * @param data The data content.
-   * @param st Type of sparsity.
+   * @param data    The data content.
+   * @param st      Type of sparsity.
    * @throws XGBoostError
    */
   @Deprecated
@@ -101,12 +94,13 @@ public DMatrix(long[] headers, int[] indices, float[] data,
 
   /**
    * Create DMatrix from Sparse matrix in CSR/CSC format.
-   * @param headers The row index of the matrix.
-   * @param indices The indices of presenting entries.
-   * @param data The data content.
-   * @param st Type of sparsity.
-   * @param shapeParam when st is CSR, it specifies the column number, otherwise it is taken as
-   * row number
+   *
+   * @param headers    The row index of the matrix.
+   * @param indices    The indices of presenting entries.
+   * @param data       The data content.
+   * @param st         Type of sparsity.
+ * @param shapeParam when st is CSR, it specifies the column number, otherwise it is taken as + * row number * @throws XGBoostError */ public DMatrix(long[] headers, int[] indices, float[] data, DMatrix.SparseType st, @@ -136,7 +130,6 @@ public DMatrix(long[] headers, int[] indices, float[] data, DMatrix.SparseType s * @param nrow number of rows * @param ncol number of columns * @throws XGBoostError native error - * * @deprecated Please specify the missing value explicitly using * {@link DMatrix(float[], int, int, float)} */ @@ -159,9 +152,10 @@ public DMatrix(BigDenseMatrix matrix) throws XGBoostError { /** * create DMatrix from dense matrix - * @param data data values - * @param nrow number of rows - * @param ncol number of columns + * + * @param data data values + * @param nrow number of rows + * @param ncol number of columns * @param missing the specified value to represent the missing value */ public DMatrix(float[] data, int nrow, int ncol, float missing) throws XGBoostError { @@ -172,13 +166,14 @@ public DMatrix(float[] data, int nrow, int ncol, float missing) throws XGBoostEr /** * create DMatrix from dense matrix - * @param matrix instance of BigDenseMatrix + * + * @param matrix instance of BigDenseMatrix * @param missing the specified value to represent the missing value */ public DMatrix(BigDenseMatrix matrix, float missing) throws XGBoostError { long[] out = new long[1]; XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromMatRef(matrix.address, matrix.nrow, - matrix.ncol, missing, out)); + matrix.ncol, missing, out)); handle = out[0]; } @@ -191,10 +186,11 @@ protected DMatrix(long handle) { /** * Create the normal DMatrix from column array interface - * @param columnBatch the XGBoost ColumnBatch to provide the cuda array interface + * + * @param columnBatch the XGBoost ColumnBatch to provide the array interface * of feature columns - * @param missing missing value - * @param nthread threads number + * @param missing missing value + * @param nthread threads number * @throws XGBoostError */ public DMatrix(ColumnBatch columnBatch, float missing, int nthread) throws XGBoostError { @@ -204,41 +200,35 @@ public DMatrix(ColumnBatch columnBatch, float missing, int nthread) throws XGBoo throw new XGBoostError("Expecting non-empty feature columns' array interface"); } XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromArrayInterfaceColumns( - json, missing, nthread, out)); + json, missing, nthread, out)); handle = out[0]; } /** - * Set label of DMatrix from cuda array interface - * - * @param column the XGBoost Column to provide the cuda array interface - * of label column - * @throws XGBoostError native error + * flatten a mat to array */ - public void setLabel(Column column) throws XGBoostError { - setXGBDMatrixInfo("label", column.toJson()); - } + private static float[] flatten(float[][] mat) { + int size = 0; + for (float[] array : mat) size += array.length; + float[] result = new float[size]; + int pos = 0; + for (float[] ar : mat) { + System.arraycopy(ar, 0, result, pos, ar.length); + pos += ar.length; + } - /** - * Set weight of DMatrix from cuda array interface - * - * @param column the XGBoost Column to provide the cuda array interface - * of weight column - * @throws XGBoostError native error - */ - public void setWeight(Column column) throws XGBoostError { - setXGBDMatrixInfo("weight", column.toJson()); + return result; } /** - * Set base margin of DMatrix from cuda array interface + * Set query id of DMatrix from array interface * - * @param column the XGBoost Column to 
provide the cuda array interface - * of base margin column + * @param column the XGBoost Column to provide the array interface + * of query id column * @throws XGBoostError native error */ - public void setBaseMargin(Column column) throws XGBoostError { - setXGBDMatrixInfo("base_margin", column.toJson()); + public void setQueryId(Column column) throws XGBoostError { + setXGBDMatrixInfo("qid", column.toJson()); } private void setXGBDMatrixInfo(String type, String json) throws XGBoostError { @@ -272,17 +262,9 @@ private String[] getXGBDMatrixFeatureInfo(String type) throws XGBoostError { return outValue[0]; } - /** - * Set feature names - * @param values feature names to be set - * @throws XGBoostError - */ - public void setFeatureNames(String[] values) throws XGBoostError { - setXGBDMatrixFeatureInfo("feature_name", values); - } - /** * Get feature names + * * @return an array of feature names to be returned * @throws XGBoostError */ @@ -291,16 +273,18 @@ public String[] getFeatureNames() throws XGBoostError { } /** - * Set feature types - * @param values feature types to be set + * Set feature names + * + * @param values feature names to be set * @throws XGBoostError */ - public void setFeatureTypes(String[] values) throws XGBoostError { - setXGBDMatrixFeatureInfo("feature_type", values); + public void setFeatureNames(String[] values) throws XGBoostError { + setXGBDMatrixFeatureInfo("feature_name", values); } /** * Get feature types + * * @return an array of feature types to be returned * @throws XGBoostError */ @@ -309,46 +293,23 @@ public String[] getFeatureTypes() throws XGBoostError { } /** - * set label of dmatrix + * Set feature types * - * @param labels labels - * @throws XGBoostError native error + * @param values feature types to be set + * @throws XGBoostError */ - public void setLabel(float[] labels) throws XGBoostError { - XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixSetFloatInfo(handle, "label", labels)); + public void setFeatureTypes(String[] values) throws XGBoostError { + setXGBDMatrixFeatureInfo("feature_type", values); } /** - * set weight of each instance + * Get group sizes of DMatrix * - * @param weights weights + * @return group size as array * @throws XGBoostError native error */ - public void setWeight(float[] weights) throws XGBoostError { - XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixSetFloatInfo(handle, "weight", weights)); - } - - /** - * Set base margin (initial prediction). - * - * The margin must have the same number of elements as the number of - * rows in this matrix. - */ - public void setBaseMargin(float[] baseMargin) throws XGBoostError { - if (baseMargin.length != rowNum()) { - throw new IllegalArgumentException(String.format( - "base margin must have exactly %s elements, got %s", - rowNum(), baseMargin.length)); - } - - XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixSetFloatInfo(handle, "base_margin", baseMargin)); - } - - /** - * Set base margin (initial prediction). 
- */ - public void setBaseMargin(float[][] baseMargin) throws XGBoostError { - setBaseMargin(flatten(baseMargin)); + public int[] getGroup() throws XGBoostError { + return getIntInfo("group_ptr"); } /** @@ -361,16 +322,6 @@ public void setGroup(int[] group) throws XGBoostError { XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixSetUIntInfo(handle, "group", group)); } - /** - * Get group sizes of DMatrix - * - * @throws XGBoostError native error - * @return group size as array - */ - public int[] getGroup() throws XGBoostError { - return getIntInfo("group_ptr"); - } - /** * Set query ids (used for ranking) * @@ -403,6 +354,27 @@ public float[] getLabel() throws XGBoostError { return getFloatInfo("label"); } + /** + * Set label of DMatrix from array interface + * + * @param column the XGBoost Column to provide the array interface + * of label column + * @throws XGBoostError native error + */ + public void setLabel(Column column) throws XGBoostError { + setXGBDMatrixInfo("label", column.toJson()); + } + + /** + * set label of dmatrix + * + * @param labels labels + * @throws XGBoostError native error + */ + public void setLabel(float[] labels) throws XGBoostError { + XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixSetFloatInfo(handle, "label", labels)); + } + /** * get weight of the DMatrix * @@ -413,6 +385,27 @@ public float[] getWeight() throws XGBoostError { return getFloatInfo("weight"); } + /** + * Set weight of DMatrix from array interface + * + * @param column the XGBoost Column to provide the array interface + * of weight column + * @throws XGBoostError native error + */ + public void setWeight(Column column) throws XGBoostError { + setXGBDMatrixInfo("weight", column.toJson()); + } + + /** + * set weight of each instance + * + * @param weights weights + * @throws XGBoostError native error + */ + public void setWeight(float[] weights) throws XGBoostError { + XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixSetFloatInfo(handle, "weight", weights)); + } + /** * Get base margin of the DMatrix. */ @@ -420,6 +413,40 @@ public float[] getBaseMargin() throws XGBoostError { return getFloatInfo("base_margin"); } + /** + * Set base margin of DMatrix from array interface + * + * @param column the XGBoost Column to provide the array interface + * of base margin column + * @throws XGBoostError native error + */ + public void setBaseMargin(Column column) throws XGBoostError { + setXGBDMatrixInfo("base_margin", column.toJson()); + } + + /** + * Set base margin (initial prediction). + *

+ * The margin must have the same number of elements as the number of + * rows in this matrix. + */ + public void setBaseMargin(float[] baseMargin) throws XGBoostError { + if (baseMargin.length != rowNum()) { + throw new IllegalArgumentException(String.format( + "base margin must have exactly %s elements, got %s", + rowNum(), baseMargin.length)); + } + + XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixSetFloatInfo(handle, "base_margin", baseMargin)); + } + + /** + * Set base margin (initial prediction). + */ + public void setBaseMargin(float[][] baseMargin) throws XGBoostError { + setBaseMargin(flatten(baseMargin)); + } + /** * Slice the DMatrix and return a new DMatrix that only contains `rowIndex`. * @@ -473,22 +500,6 @@ public long getHandle() { return handle; } - /** - * flatten a mat to array - */ - private static float[] flatten(float[][] mat) { - int size = 0; - for (float[] array : mat) size += array.length; - float[] result = new float[size]; - int pos = 0; - for (float[] ar : mat) { - System.arraycopy(ar, 0, result, pos, ar.length); - pos += ar.length; - } - - return result; - } - @Override protected void finalize() { dispose(); @@ -500,4 +511,12 @@ public synchronized void dispose() { handle = 0; } } + + /** + * sparse matrix type (CSR or CSC) + */ + public enum SparseType { + CSR, + CSC + } } diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala index 3aaaeda0c894..294107f082fa 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2023 by Contributors + Copyright (c) 2014-2024 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -33,13 +33,13 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) { } /** - * init DMatrix from Iterator of LabeledPoint - * - * @param dataIter An iterator of LabeledPoint - * @param cacheInfo Cache path information, used for external memory setting, null by default. - * @param missing Which value will be treated as the missing value - * @throws XGBoostError native error - */ + * init DMatrix from Iterator of LabeledPoint + * + * @param dataIter An iterator of LabeledPoint + * @param cacheInfo Cache path information, used for external memory setting, null by default. 
+   * @param missing   Which value will be treated as the missing value
+   * @throws XGBoostError native error
+   */
   def this(dataIter: Iterator[LabeledPoint],
            cacheInfo: String = null,
            missing: Float = Float.NaN) {
@@ -63,12 +63,12 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
   /**
    * create DMatrix from sparse matrix
    *
-   * @param headers index to headers (rowHeaders for CSR or colHeaders for CSC)
-   * @param indices Indices (colIndexs for CSR or rowIndexs for CSC)
-   * @param data non zero values (sequence by row for CSR or by col for CSC)
-   * @param st sparse matrix type (CSR or CSC)
+   * @param headers    index to headers (rowHeaders for CSR or colHeaders for CSC)
+   * @param indices    Indices (colIndices for CSR or rowIndices for CSC)
+   * @param data       non-zero values (sequence by row for CSR or by col for CSC)
+   * @param st         sparse matrix type (CSR or CSC)
    * @param shapeParam when st is CSR, it specifies the column number, otherwise it is taken as
-   * row number
+   *                   row number
    */
   @throws(classOf[XGBoostError])
   def this(headers: Array[Long], indices: Array[Int], data: Array[Float], st: JDMatrix.SparseType,
@@ -79,14 +79,14 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
   /**
    * create DMatrix from sparse matrix
    *
-   * @param headers index to headers (rowHeaders for CSR or colHeaders for CSC)
-   * @param indices Indices (colIndexs for CSR or rowIndexs for CSC)
-   * @param data non zero values (sequence by row for CSR or by col for CSC)
-   * @param st sparse matrix type (CSR or CSC)
+   * @param headers    index to headers (rowHeaders for CSR or colHeaders for CSC)
+   * @param indices    Indices (colIndices for CSR or rowIndices for CSC)
+   * @param data       non-zero values (sequence by row for CSR or by col for CSC)
+   * @param st         sparse matrix type (CSR or CSC)
    * @param shapeParam when st is CSR, it specifies the column number, otherwise it is taken as
-   * row number
-   * @param missing missing value
-   * @param nthread The number of threads used for constructing DMatrix
+   *                   row number
+   * @param missing    missing value
+   * @param nthread    The number of threads used for constructing DMatrix
    */
   @throws(classOf[XGBoostError])
   def this(headers: Array[Long], indices: Array[Int], data: Array[Float], st: JDMatrix.SparseType,
@@ -96,10 +96,11 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
 
   /**
    * Create the normal DMatrix from column array interface
+   *
    * @param columnBatch the XGBoost ColumnBatch to provide the cuda array interface
    *                    of feature columns
-   * @param missing missing value
-   * @param nthread The number of threads used for constructing DMatrix
+   * @param missing     missing value
+   * @param nthread     The number of threads used for constructing DMatrix
    */
   @throws(classOf[XGBoostError])
   def this(columnBatch: ColumnBatch, missing: Float, nthread: Int) {
@@ -122,9 +123,9 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
   /**
    * create DMatrix from dense matrix
    *
-   * @param data data values
-   * @param nrow number of rows
-   * @param ncol number of columns
+   * @param data    data values
+   * @param nrow    number of rows
+   * @param ncol    number of columns
    * @param missing the specified value to represent the missing value
    */
   @throws(classOf[XGBoostError])
@@ -218,8 +219,17 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
     jDMatrix.setBaseMargin(column)
   }
 
+  /**
+   * set query id of dmatrix from column array interface
+   */
+  @throws(classOf[XGBoostError])
+  def setQueryId(column: Column): Unit = {
+    jDMatrix.setQueryId(column)
+  }
+
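The Column-based setQueryId above is the last JVM piece the ranking path needs: the GPU plugin hands the qid table to CudfColumnBatch as its new fifth argument, and the native iterator forwards the "qid" field of the batch's array-interface JSON to the proxy DMatrix. A minimal sketch of that wiring (the Table arguments featureTable, labelTable and qidTable are hypothetical device-resident inputs, not part of this patch; the missing/maxBin/nthread values are placeholders):

    import ai.rapids.cudf.Table
    import ml.dmlc.xgboost4j.java.CudfColumnBatch
    import ml.dmlc.xgboost4j.scala.QuantileDMatrix

    // Sketch only: builds a QuantileDMatrix carrying query ids for ranking.
    def buildRankingQdm(featureTable: Table, labelTable: Table, qidTable: Table): QuantileDMatrix = {
      // weight and baseMargin are omitted (null); qidTable is the new fifth argument.
      val cb = new CudfColumnBatch(featureTable, labelTable, null, null, qidTable)
      // missing = NaN, maxBin = 256, nthread = 1 are assumed defaults for the sketch.
      new QuantileDMatrix(Seq(cb).iterator, Float.NaN, 256, 1)
    }
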
   /**
    * set feature names
+   *
    * @param values feature names
    * @throws ml.dmlc.xgboost4j.java.XGBoostError
    */
@@ -230,6 +240,7 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
 
   /**
    * set feature types
+   *
    * @param values feature types
    * @throws ml.dmlc.xgboost4j.java.XGBoostError
    */
@@ -278,6 +289,7 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
 
   /**
    * get feature names
+   *
    * @throws ml.dmlc.xgboost4j.java.XGBoostError
    * @return
    */
@@ -288,6 +300,7 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
 
   /**
    * get feature types
+   *
    * @throws ml.dmlc.xgboost4j.java.XGBoostError
    * @return
    */
diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu
index b784b21ec5f6..a705751b1583 100644
--- a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu
+++ b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu
@@ -104,7 +104,8 @@ void CopyInterface(std::vector> &interface_arr,
   }
 }
 
-void CopyMetaInfo(Json *p_interface, dh::device_vector<float> *out, cudaStream_t stream) {
+template <typename T>
+void CopyMetaInfo(Json *p_interface, dh::device_vector<T> *out, cudaStream_t stream) {
   auto &j_interface = *p_interface;
   CHECK_EQ(get<Array>(j_interface).size(), 1);
   auto object = get<Object>(get<Array>(j_interface)[0]);
@@ -151,9 +152,11 @@ class DataIteratorProxy {
   std::vector<std::unique_ptr<dh::device_vector<float>>> labels_;
   std::vector<std::unique_ptr<dh::device_vector<float>>> weights_;
   std::vector<std::unique_ptr<dh::device_vector<float>>> base_margins_;
+  std::vector<std::unique_ptr<dh::device_vector<int>>> qids_;
   std::vector<Json> label_interfaces_;
   std::vector<Json> weight_interfaces_;
   std::vector<Json> margin_interfaces_;
+  std::vector<Json> qid_interfaces_;
 
   size_t it_{0};
   size_t n_batches_{0};
@@ -220,6 +223,16 @@ class DataIteratorProxy {
       Json::Dump(basemargin, &str);
       XGDMatrixSetInfoFromInterface(proxy_, "base_margin", str.c_str());
     }
+
+    if (json_map.find("qid") != json_map.cend()) {
+      Json qid = json_interface["qid"];
+      qids_.emplace_back(new dh::device_vector<int>);
+      CopyMetaInfo(&qid, qids_.back().get(), copy_stream_);
+      qid_interfaces_.emplace_back(qid);
+
+      Json::Dump(qid, &str);
+      XGDMatrixSetInfoFromInterface(proxy_, "qid", str.c_str());
+    }
   }
 
   void CloseJvmBatch() {
@@ -337,6 +350,12 @@ class DataIteratorProxy {
       XGDMatrixSetInfoFromInterface(proxy_, "base_margin", str.c_str());
     }
 
+    if (n_batches_ == this->qid_interfaces_.size()) {
+      auto const &qid = this->qid_interfaces_.at(it_);
+      Json::Dump(qid, &str);
+      XGDMatrixSetInfoFromInterface(proxy_, "qid", str.c_str());
+    }
+
     // Data
     auto const &json_interface = host_columns_.at(it_)->interfaces;
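Taken together, the pieces in this patch surface as a one-step ranking workflow on the Spark side. A hedged usage sketch follows; the DataFrames and the column names ("label", "group", "c1".."c3") are assumptions mirroring the Ranking test data generated by TrainTestData above:

    import org.apache.spark.sql.DataFrame
    import ml.dmlc.xgboost4j.scala.spark.XGBoostRanker

    // Sketch only: trains a GPU ranker and scores a test set.
    def rankOnGpu(trainDf: DataFrame, testDf: DataFrame): DataFrame = {
      val ranker = new XGBoostRanker(Map("objective" -> "rank:ndcg"))
        .setFeaturesCol(Array("c1", "c2", "c3"))
        .setLabelCol("label")
        .setGroupCol("group") // rows sharing a group id form one query; forwarded as "qid"
        .setNumRound(100)
        .setDevice("cuda")    // routes training through the GPU plugin and QuantileDMatrix
      ranker.fit(trainDf).transform(testDf)
    }

The same shape applies to XGBoostRegressor by dropping setGroupCol, as exercised by the regression test in GpuXGBoostPluginSuite above.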