import numpy as np
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel

# Load and parse the space-separated data
data = sc.textFile("data/mllib/gmm_data.txt")
parsed_data = data.map(lambda line: np.array([float(i) for i in line.strip().split(' ')]))

# Fit a two-component Gaussian mixture model
gmm = GaussianMixture.train(parsed_data, 2)

# Print the weight, mean, and covariance of each fitted component
for w, g in zip(gmm.weights, gmm.gaussians):
    print("weight = ", w, "mu = ", g.mu, "sigma = ", g.sigma.toArray())

# Save and reload the model
gmm.save(sc, "model_path")
same_model = GaussianMixtureModel.load(sc, "model_path")
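Once trained, the mixture model can also assign points to components. As a minimal sketch continuing the example above (same parsed_data and two components), predict returns the most likely component for each point, while predictSoft returns the full membership probabilities:

# Hard assignments: index of the most likely Gaussian for each point
labels = gmm.predict(parsed_data)
print(labels.take(5))

# Soft assignments: per-point membership probabilities over the 2 components
memberships = gmm.predictSoft(parsed_data)
print(memberships.take(5))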
Latent Dirichlet Allocation (LDA)
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

# Load and parse the space-separated word-count data
data = sc.textFile("data/mllib/sample_lda_data.txt")
parsed_data = data.map(lambda line: Vectors.dense([float(i) for i in line.strip().split(' ')]))

# Index each document with a unique ID: [doc_id, word-count vector]
corpus = parsed_data.zipWithIndex() \
    .map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into three topics
ldaModel = LDA.train(corpus, k=3)

# topicsMatrix() is a vocabSize x k matrix; each column is a topic's distribution over words
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

# Save and reload the model
ldaModel.save(sc, "model_path")
same_model = LDAModel.load(sc, "model_path")
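For a more compact view than the full topics matrix, the model can also report only the top-weighted terms of each topic. A minimal sketch, assuming a Spark version that exposes LDAModel.describeTopics (available since Spark 1.6); the choice of 10 terms per topic is arbitrary:

# Top 10 term indices and weights for each of the 3 topics
for i, (term_indices, term_weights) in enumerate(ldaModel.describeTopics(maxTermsPerTopic=10)):
    print("Topic " + str(i) + ":")
    for idx, weight in zip(term_indices, term_weights):
        print("  term " + str(idx) + " weight " + str(weight))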