import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

/**
 * Topic model for documents (LDA).
 */
object b_LDA extends App {
  val conf = new SparkConf().setAppName("b_LDA")
  // Set master to local[4]: local mode with 4 simulated worker threads.
  conf.setMaster("local[4]")
  // Create the SparkContext.
  val sc = new SparkContext(conf)
  val spark = SparkSession.builder().getOrCreate()
  sc.setLogLevel("ERROR")
  val dataset = spark.read.format("libsvm").load("D:\\data\\sample_libsvm_data.txt")

  import org.apache.spark.ml.clustering.LDA
  // Trains an LDA model.
  val lda = new LDA().setK(10).setMaxIter(10)
  val model = lda.fit(dataset)
  val ll = model.logLikelihood(dataset)
  val lp = model.logPerplexity(dataset)
  println(s"The lower bound on the log likelihood of the entire corpus: $ll")
  // The lower bound on the log likelihood of the entire corpus: -1.2923084327880664E7
  println(s"The upper bound on perplexity: $lp")
  // The upper bound on perplexity: 5.308987518591441
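  // A minimal sketch (not in the original): perplexity can guide the choice of K.
  // Lower perplexity on the corpus generally indicates a better-fitting topic count;
  // the candidate values below are illustrative only.
  for (k <- Seq(5, 10, 20)) {
    val m = new LDA().setK(k).setMaxIter(10).fit(dataset)
    println(s"k=$k perplexity=${m.logPerplexity(dataset)}")
  }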
  // Describe the topics.
  val topics = model.describeTopics(3)
  println("The topics described by their top-weighted terms:")
  topics.show(3)
  /**
   * The topics described by their top-weighted terms:
   * +-----+---------------+--------------------+
   * |topic|    termIndices|         termWeights|
   * +-----+---------------+--------------------+
   * |    0|[597, 569, 598]|[0.01126906771778...|
   * |    1|[415, 398, 601]|[0.00950113504163...|
   * |    2|[261, 233, 260]|[0.01749002981877...|
   * +-----+---------------+--------------------+
   */

  // Shows the result.
  val transformed = model.transform(dataset)
  transformed.show(3)
  /**
   * +-----+--------------------+--------------------+
   * |label|            features|   topicDistribution|
   * +-----+--------------------+--------------------+
   * |  0.0|(692,[127,128,129...|[0.80140785875680...|
   * |  1.0|(692,[158,159,160...|[0.04601240753292...|
   * |  1.0|(692,[124,125,126...|[5.36847481605803...|
   * +-----+--------------------+--------------------+
   */
}
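/**
 * A minimal sketch (not part of the original): with a pre-vectorized libsvm file
 * there is no vocabulary, so describeTopics can only report term indices. When the
 * corpus is vectorized with CountVectorizer instead, the fitted vocabulary maps
 * those indices back to words. The tiny corpus below is hypothetical.
 */
object b_LDA_TopWords extends App {
  val spark = SparkSession.builder().master("local[4]").appName("b_LDA_TopWords").getOrCreate()
  spark.sparkContext.setLogLevel("ERROR")
  import spark.implicits._
  import org.apache.spark.ml.clustering.LDA
  import org.apache.spark.ml.feature.CountVectorizer

  // Hypothetical tokenized documents; any array-of-strings column works.
  val docs = Seq(
    Seq("spark", "mllib", "lda", "topic"),
    Seq("cluster", "model", "lda", "topic"),
    Seq("spark", "cluster", "kmeans")
  ).toDF("words")

  // CountVectorizer keeps the vocabulary it learned while vectorizing.
  val cvModel = new CountVectorizer().setInputCol("words").setOutputCol("features").fit(docs)
  val model = new LDA().setK(2).setMaxIter(10).fit(cvModel.transform(docs))

  // Translate each topic's term indices into actual words.
  val vocab = cvModel.vocabulary
  model.describeTopics(3).collect().foreach { row =>
    val topic = row.getInt(0)
    val words = row.getSeq[Int](1).map(vocab(_))
    println(s"topic $topic: ${words.mkString(", ")}")
  }
}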
/**
 * Created by admin on 2018/4/27.
 * Bisecting K-means algorithm.
 */
object c_BisectingKMeans extends App {
  val conf = new SparkConf().setAppName("c_BisectingKMeans")
  // Set master to local[4]: local mode with 4 simulated worker threads.
  conf.setMaster("local[4]")
  // Create the SparkContext.
  val sc = new SparkContext(conf)
  val spark = SparkSession.builder().getOrCreate()
  sc.setLogLevel("ERROR")

  import org.apache.spark.ml.clustering.BisectingKMeans
  val dataset = spark.read.format("libsvm").load("D:\\data\\sample_kmeans_data.txt")
  // Trains a bisecting k-means model.
  val bkm = new BisectingKMeans().setK(2).setSeed(1)
  val model = bkm.fit(dataset)
  // Evaluate clustering.
  val cost = model.computeCost(dataset)
  println(s"Within Set Sum of Squared Errors = $cost")
  // Within Set Sum of Squared Errors = 0.11999999999994547

  // Shows the result.
  println("Cluster Centers: ")
  val centers = model.clusterCenters
  centers.foreach(println)
  /**
   * Cluster Centers:
   * [0.1,0.1,0.1]
   * [9.1,9.1,9.1]
   */
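  // A minimal sketch (not in the original): transform appends a "prediction"
  // column with each point's assigned cluster. In newer Spark releases,
  // computeCost is deprecated in favor of ClusteringEvaluator's silhouette score.
  import org.apache.spark.ml.evaluation.ClusteringEvaluator
  val predictions = model.transform(dataset)
  val silhouette = new ClusteringEvaluator().evaluate(predictions)
  println(s"Silhouette with squared euclidean distance = $silhouette")
}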
/**
 * Gaussian mixture model.
 */
object d_GaussianMixture extends App {
  val conf = new SparkConf().setAppName("d_GaussianMixture")
  // Set master to local[4]: local mode with 4 simulated worker threads.
  conf.setMaster("local[4]")
  // Create the SparkContext.
  val sc = new SparkContext(conf)
  val spark = SparkSession.builder().getOrCreate()
  sc.setLogLevel("ERROR")
  val dataset = spark.read.format("libsvm").load("D:\\data\\sample_kmeans_data.txt")
  import org.apache.spark.ml.clustering.GaussianMixture

  // Trains a Gaussian mixture model.
  val gmm = new GaussianMixture().setK(2)
  val model = gmm.fit(dataset)

  // Output the parameters of the fitted mixture model.
  for (i <- 0 until model.getK) {
    println("weight=%f\nmu=%s\nsigma=\n%s\n" format
      (model.weights(i), model.gaussians(i).mean, model.gaussians(i).cov))
  }
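  // A minimal sketch (not in the original): transform appends a "prediction"
  // column (hard assignment) and a "probability" column (per-cluster posterior
  // probabilities), exposing the soft assignments a GMM provides.
  model.transform(dataset).select("features", "prediction", "probability").show(3)
}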