Intel-bigdata · bobjiang82 · Jul 15, 2020 · Jul 15, 2020 · Jul 15, 2020 · Jul 17, 2020
diff --git a/conf/hibench.conf b/conf/hibench.conf
@@ -1,6 +1,6 @@
 # Data scale profile. Available value is tiny, small, large, huge, gigantic and bigdata.
 # The definition of these profiles can be found in the workload's conf file i.e. conf/workloads/micro/wordcount.conf
-hibench.scale.profile                tiny
+hibench.scale.profile                large
 # Mapper number in hadoop, partition number in Spark
 hibench.default.map.parallelism         8
 

diff --git a/conf/workloads/ml/xgboost.conf b/conf/workloads/ml/xgboost.conf
@@ -17,7 +17,7 @@ hibench.xgboost.features                    ${hibench.xgboost.${hibench.scale.pr
 hibench.xgboost.partitions                  ${hibench.default.map.parallelism}
 
 hibench.xgboost.numClasses                  2
-hibench.xgboost.maxDepth                    30
+hibench.xgboost.maxDepth                    8
 hibench.xgboost.maxBins                     32
 hibench.xgboost.numIterations               20
 hibench.xgboost.learningRate                0.1

diff --git a/docs/run-sparkbench.md b/docs/run-sparkbench.md
@@ -88,3 +88,59 @@ hibench.yarn.executor.num   |   Spark executor number in Yarn mode
 hibench.yarn.executor.cores  |  Spark executor cores in Yarn mode
 spark.executor.memory  | Spark executor memory
 spark.driver.memory    | Spark driver memory
+
+
+### 8. Run xgboost workload ###
+
+The HiBench xgboost benchmark depends on the xgboost libraries to build and run. The libraries are ```xgboost4j_<scala version>-<xgboost version>.jar``` and ```xgboost4j-spark_<scala version>-<xgboost version>.jar```.<br>
+The relevant configurations are in ```./sparkbench/ml/pom.xml```
+```
+    <dependency>
+      <groupId>ml.dmlc</groupId>
+      <artifactId>xgboost4j_${scala.binary.version}</artifactId>
+      <version>1.1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>ml.dmlc</groupId>
+      <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
+      <version>1.1.0</version>
+    </dependency>
+```
+and ```./pom.xml```
+```
+    <repository>
+      <id>xgboostrepo</id>
+      <name>XGBoost Maven Repo</name>
+      <url>https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release</url>
+      <releases>
+        <enabled>true</enabled>
+      </releases>
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+    </repository>
+```
+
+#### 8.a latest xgboost release (default) ####
+
+By default, the HiBench xgboost benchmark is configured to use xgboost release 1.1.0 (the latest release at the time of writing) from https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release.<br>
+To use it, simply build HiBench, prepare the data and run the xgboost benchmark. For example:
+```
+$ mvn -Psparkbench -Dmodules -Pml -Dspark=2.4 -Dscala=2.12 clean package
+$ bin/workloads/ml/xgboost/prepare/prepare.sh && hdfs dfs -du -s -h /HiBench/XGBoost/Input
+$ bin/workloads/ml/xgboost/spark/run.sh
+```
+
+#### 8.b other xgboost releases ####
+
+To use another xgboost release, change the versions of xgboost4j and xgboost4j-spark to the target version in ```./sparkbench/ml/pom.xml```. The ```scala.binary.version``` can be specified with the command-line parameter ```-Dscala```.<br>
+For example, to use xgboost v1.0.0, change ```<version>1.1.0</version>``` to ```<version>1.0.0</version>``` for both xgboost4j and xgboost4j-spark.<br>
+If the xgboost release comes from another Maven repository, update the xgboostrepo url in ```./pom.xml``` as well.<br>
+After that, build HiBench, prepare the data and run the xgboost benchmark.
+
+#### 8.c xgboost jar files ####
+
+If you only have the xgboost jar files, copy them to ```$SPARK_HOME/jars/``` and update the versions of xgboost4j and xgboost4j-spark in ```./sparkbench/ml/pom.xml``` so that they match the jars.<br>
+For example, if xgboost is built from source on a Linux platform, the jars are generated and installed to ```~/.m2/repository/ml/dmlc/xgboost4j_<scala version>/<xgboost version>-SNAPSHOT/``` and ```~/.m2/repository/ml/dmlc/xgboost4j-spark_<scala version>/<xgboost version>-SNAPSHOT/``` respectively. To use them, copy the two jars to ```$SPARK_HOME/jars/``` and update the versions of xgboost4j and xgboost4j-spark in ```./sparkbench/ml/pom.xml``` so that they match the jars.<br>
+After that, build HiBench, prepare the data and run the xgboost benchmark.
+
diff --git a/pom.xml b/pom.xml
@@ -80,6 +80,17 @@
       <name>Scala-tools Maven 2 Repository</name>
       <url>https://oss.sonatype.org/content/groups/scala-tools/</url>
     </repository>
+    <repository>
+      <id>xgboostrepo</id>
+      <name>XGBoost Maven Repo</name>
+      <url>https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release</url>
+      <releases>
+	      <enabled>true</enabled>
+      </releases>
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+    </repository>
   </repositories>
   <pluginRepositories>
     <pluginRepository>

diff --git a/sparkbench/ml/pom.xml b/sparkbench/ml/pom.xml
@@ -56,12 +56,12 @@
       <dependency>
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-        <version>1.0.0</version>
+        <version>1.1.0</version>
       </dependency>
       <dependency>
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
-        <version>1.0.0</version>
+        <version>1.1.0</version>
       </dependency>
   </dependencies>
 </project>
diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala
@@ -18,11 +18,9 @@
 package com.intel.hibench.sparkbench.ml
 
 import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.mllib.tree.GradientBoostedTrees
-import org.apache.spark.mllib.tree.configuration.BoostingStrategy
-import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
 import org.apache.spark.rdd.RDD
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.ml.Pipeline
 import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
 import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
@@ -33,7 +31,7 @@ object XGBoost {
 
   case class Params(
     numClasses: Int = 2,
-    maxDepth: Int = 30,
+    maxDepth: Int = 8,
     maxBins: Int = 32,
     numIterations: Int = 20,
     learningRate: Double = 0.1,
@@ -93,7 +91,7 @@ object XGBoost {
     val mllibRDD: RDD[LabeledPoint] = sc.objectFile(dataPath)
     // Convert to ML LabeledPoint and to DataFrame
     val mlRDD: RDD[NewLabeledPoint] = mllibRDD.map { p => NewLabeledPoint(p.label, p.features.asML) }
-    val data = mlRDD.toDF
+    val data = mlRDD.toDF("label", "features")
 
     // Split the data into training and test sets (30% held out for testing)
     val splits = data.randomSplit(Array(0.7, 0.3))
@@ -123,7 +121,9 @@ object XGBoost {
       setFeaturesCol("features").
       setLabelCol("label")
 
-    val model = xgbClassifier.fit(trainingData)
+    val pipeline = new Pipeline().setStages(Array(xgbClassifier))
+
+    val model = pipeline.fit(trainingData)
 
     // Make predictions.
     val predictions = model.transform(testData)