Machine learning with Spark
MLlib is Spark's scalable machine learning library containing common learning algorithms and utilities.
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.classification import LogisticRegression
# Build a small labelled training set with columns `id`, `text`, `label`.
# NOTE(review): `spark` is not defined in this snippet — assumes an active
# SparkSession already exists (e.g. the one the PySpark shell provides).
training = spark.createDataFrame(
    [
        (0, "spark is great", 1.0),
        (1, "mllib is awesome", 1.0),
        (2, "slow expensive", 0.0),
    ],
    schema=["id", "text", "label"],
)
# Wire up the pipeline stages: text -> words -> features -> classifier.

# Stage 1: reads the `text` column and writes a `words` column.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

# Stage 2: reads `words` and writes the `features` column.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="features",
)

# Stage 3: logistic regression, at most 10 iterations, regularization 0.01.
# No column names are passed, so it relies on the estimator's defaults
# (presumably `features` / `label`, matching the columns above — confirm
# against the MLlib docs).
lr = LogisticRegression(
    maxIter=10,
    regParam=0.01,
)

# Stages run in list order.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        lr,
    ],
)
# Fit the whole pipeline on the training DataFrame; `model` holds the
# fitted result.
model = pipeline.fit(dataset=training)