We will now take a look at the statistics of numeric features:
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

// Builds `allStats`: the describe() summary of every non-string column of
// trainDF, extended with three extra rows holding the first quartile, the
// median and the third quartile, and registers it as the view "allStats".

// Every column whose type is not StringType is treated as a numeric feature.
// NOTE(review): this filter would also admit other non-string types (e.g.
// boolean or timestamp columns) if the schema contains any — confirm.
val numericFeatures = trainDF.schema.filter(_.dataType != StringType)

val description = trainDF.describe(numericFeatures.map(_.name): _*)

// approxQuantile returns one Array(q1, median, q3) per feature; transposing
// yields one sequence per quantile, ordered like numericFeatures.
// The relative error is 0, as in the original call.
val quantils = numericFeatures
  .map(f => trainDF.stat.approxQuantile(f.name, Array(.25, .5, .75), 0))
  .transpose

// One labelled record per quantile: the label first, then one value per feature.
val rowSeq = Seq(
  Seq("q1" +: quantils(0): _*),
  Seq("median" +: quantils(1): _*),
  Seq("q3" +: quantils(2): _*))

// Generic Rows instead of a fixed-arity tuple, so this works for any number
// of numeric features rather than exactly ten.
val rows = rowSeq.map(Row.fromSeq)

// Schema for the quantile rows: a string label followed by one double column
// per feature. The union below matches columns by position, so the result
// keeps the column names of `description`.
val quantileSchema = StructType(
  StructField("summary", StringType) +:
    numericFeatures.map(f => StructField(f.name, DoubleType)))

// union / createOrReplaceTempView replace the deprecated
// unionAll / registerTempTable.
val allStats = description.union(
  description.sqlContext.createDataFrame(sc.parallelize(rows), quantileSchema))
allStats.createOrReplaceTempView("allStats")

// ...