Mirror of https://github.com/microsoft/spark.git

Clean up scaladoc in ML Lib.

Also build and copy ML Lib scaladoc in the Spark docs build. Some more minor
cleanup with respect to naming, test locations, etc.

Parent: e5b9ed2833
Commit: 4935a2558b
@@ -20,7 +20,7 @@ include FileUtils
 if ENV['SKIP_API'] != '1'
   # Build Scaladoc for Java/Scala
-  projects = ["core", "examples", "repl", "bagel", "streaming"]
+  projects = ["core", "examples", "repl", "bagel", "streaming", "mllib"]
 
   puts "Moving to project root and building scaladoc."
   curr_dir = pwd
@@ -27,8 +27,10 @@ import scala.math.round
 import org.jblas.DoubleMatrix
 
 /**
- * Logistic Regression using Stochastic Gradient Descent.
- * Based on Matlab code written by John Duchi.
+ * Classification model trained using Logistic Regression.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 class LogisticRegressionModel(
   override val weights: Array[Double],
@@ -43,7 +45,10 @@ class LogisticRegressionModel(
   }
 }
 
-class LogisticRegressionWithSGD (
+/**
+ * Train a classification model for Logistic Regression using Stochastic Gradient Descent.
+ */
+class LogisticRegressionWithSGD private (
   var stepSize: Double,
   var numIterations: Int,
   var regParam: Double,
@@ -70,10 +75,10 @@ class LogisticRegressionWithSGD (
 
 /**
  * Top-level methods for calling Logistic Regression.
- * NOTE(shivaram): We use multiple train methods instead of default arguments to support
- * Java programs.
  */
 object LogisticRegressionWithSGD {
+  // NOTE(shivaram): We use multiple train methods instead of default arguments to support
+  // Java programs.
 
   /**
    * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
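For reference, the multiple `train` overloads exist so Java callers can avoid Scala default arguments. A minimal usage sketch, assuming the `train(input, numIterations, stepSize)` overload and the local-mode `SparkContext` constructor from this era (the input path is a placeholder):

```scala
import spark.SparkContext
import spark.mllib.classification.LogisticRegressionWithSGD
import spark.mllib.util.MLUtils

object TrainLogisticRegression {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "TrainLogisticRegression")
    // RDD[LabeledPoint] parsed from "<label>,<f1> <f2> ..." lines.
    val points = MLUtils.loadLabeledData(sc, "/tmp/lr-input")
    // One of the Java-friendly overloads: 100 SGD iterations, step size 1.0.
    val model = LogisticRegressionWithSGD.train(points, 100, 1.0)
    val predictions = points.map(p => model.predict(p.features))
    sc.stop()
  }
}
```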
@@ -26,7 +26,10 @@ import spark.mllib.util.MLUtils
 import org.jblas.DoubleMatrix
 
 /**
- * SVM using Stochastic Gradient Descent.
+ * Model built using SVM.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 class SVMModel(
   override val weights: Array[Double],
@@ -40,6 +43,9 @@ class SVMModel(
   }
 }
 
+/**
+ * Train an SVM using Stochastic Gradient Descent.
+ */
 class SVMWithSGD private (
   var stepSize: Double,
   var numIterations: Int,
@@ -19,18 +19,29 @@ package spark.mllib.optimization
 
 import org.jblas.DoubleMatrix
 
+/**
+ * Class used to compute the gradient for a loss function, given a single data point.
+ */
 abstract class Gradient extends Serializable {
   /**
-   * Compute the gradient for a given row of data.
+   * Compute the gradient and loss given features of a single data point.
    *
-   * @param data - One row of data. Row matrix of size 1xn where n is the number of features.
+   * @param data - Feature values for one data point. Column matrix of size nx1
+   *               where n is the number of features.
    * @param label - Label for this data item.
    * @param weights - Column matrix containing weights for every feature.
+   *
+   * @return A tuple of 2 elements. The first element is a column matrix containing the computed
+   *         gradient and the second element is the loss computed at this data point.
+   *
    */
   def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double)
 }
 
+/**
+ * Compute gradient and loss for a logistic loss function.
+ */
 class LogisticGradient extends Gradient {
   override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {
@@ -49,7 +60,9 @@ class LogisticGradient extends Gradient {
   }
 }
 
+/**
+ * Compute gradient and loss for a Least-squared loss function.
+ */
 class SquaredGradient extends Gradient {
   override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {
@@ -62,7 +75,9 @@ class SquaredGradient extends Gradient {
   }
 }
 
+/**
+ * Compute gradient and loss for a Hinge loss function.
+ */
 class HingeGradient extends Gradient {
   override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {
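To make the `Gradient` contract concrete, here is a hypothetical implementation that is not part of this commit: an absolute-error gradient, following the nx1 column-matrix convention the commit settles on:

```scala
import org.jblas.DoubleMatrix

// Hypothetical example: absolute-error loss, loss = |w.x - y| and
// gradient = signum(w.x - y) * x. Returns the (gradient, loss) tuple.
class AbsoluteGradient extends Gradient {
  override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
      (DoubleMatrix, Double) = {
    val diff = data.dot(weights) - label   // data is an nx1 column matrix
    (data.mul(math.signum(diff)), math.abs(diff))
  }
}
```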
@@ -24,12 +24,17 @@ import org.jblas.DoubleMatrix
 
 import scala.collection.mutable.ArrayBuffer
 
+/**
+ * Class used to solve an optimization problem using Gradient Descent.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
 class GradientDescent(var gradient: Gradient, var updater: Updater) extends Optimizer {
 
-  var stepSize: Double = 1.0
-  var numIterations: Int = 100
-  var regParam: Double = 0.0
-  var miniBatchFraction: Double = 1.0
+  private var stepSize: Double = 1.0
+  private var numIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0
 
   /**
    * Set the step size per-iteration of SGD. Default 1.0.
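With these fields now private, callers configure the optimizer through its `set*` methods instead (the first one is shown above). A sketch of the resulting call style, assuming each setter is named after its field and returns `this` for chaining:

```scala
val sgd = new GradientDescent(new LogisticGradient(), new SimpleUpdater())
  .setStepSize(0.5)
  .setNumIterations(200)
  .setRegParam(0.0)
  .setMiniBatchFraction(0.1)
```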
@@ -97,10 +102,10 @@ class GradientDescent(var gradient: Gradient, var updater: Updater) extends Optimizer {
 
 }
 
+// Top-level method to run gradient descent.
 object GradientDescent extends Logging {
   /**
    * Run gradient descent in parallel using mini batches.
-   * Based on Matlab code written by John Duchi.
    *
    * @param data - Input data for SGD. RDD of form (label, [feature values]).
    * @param gradient - Gradient object that will be used to compute the gradient.
@@ -137,8 +142,8 @@ object GradientDescent extends Logging {
     for (i <- 1 to numIterations) {
       val (gradientSum, lossSum) = data.sample(false, miniBatchFraction, 42+i).map {
         case (y, features) =>
-          val featuresRow = new DoubleMatrix(features.length, 1, features:_*)
-          val (grad, loss) = gradient.compute(featuresRow, y, weights)
+          val featuresCol = new DoubleMatrix(features.length, 1, features:_*)
+          val (grad, loss) = gradient.compute(featuresCol, y, weights)
           (grad, loss)
       }.reduce((a, b) => (a._1.addi(b._1), a._2 + b._2))
@@ -20,10 +20,14 @@ package spark.mllib.optimization
 import scala.math._
 import org.jblas.DoubleMatrix
 
+/**
+ * Class used to update weights used in Gradient Descent.
+ */
 abstract class Updater extends Serializable {
   /**
-   * Compute an updated value for weights given the gradient, stepSize and iteration number.
-   * Also returns the regularization value computed using the *updated* weights.
+   * Compute an updated value for weights given the gradient, stepSize, iteration number and
+   * regularization parameter. Also returns the regularization value computed using the
+   * *updated* weights.
    *
    * @param weightsOld - Column matrix of size nx1 where n is the number of features.
    * @param gradient - Column matrix of size nx1 where n is the number of features.
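An updater satisfying this contract reduces to a few lines. A sketch modeled on `SimpleUpdater` (step size decaying with the square root of the iteration number, no regularization):

```scala
import org.jblas.DoubleMatrix

// Returns the new weights plus the regularization value (zero here,
// since no regularization is applied).
class NoRegUpdater extends Updater {
  override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
      stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {
    val thisIterStepSize = stepSize / math.sqrt(iter)
    (weightsOld.sub(gradient.mul(thisIterStepSize)), 0.0)
  }
}
```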
@@ -38,6 +42,10 @@ abstract class Updater extends Serializable {
       regParam: Double): (DoubleMatrix, Double)
 }
 
+/**
+ * A simple updater that adaptively adjusts the learning rate by the
+ * square root of the number of iterations. Does not perform any regularization.
+ */
 class SimpleUpdater extends Updater {
   override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
       stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {
@@ -48,11 +56,15 @@ class SimpleUpdater extends Updater {
 }
 
 /**
- * L1 regularization -- corresponding proximal operator is the soft-thresholding function
- * That is, each weight component is shrunk towards 0 by shrinkageVal
+ * Updater that adjusts learning rate and performs L1 regularization.
+ *
+ * The corresponding proximal operator used is the soft-thresholding function.
+ * That is, each weight component is shrunk towards 0 by shrinkageVal.
+ *
  * If w > shrinkageVal, set weight component to w-shrinkageVal.
  * If w < -shrinkageVal, set weight component to w+shrinkageVal.
  * If -shrinkageVal < w < shrinkageVal, set weight component to 0.
+ *
  * Equivalently, set weight component to signum(w) * max(0.0, abs(w) - shrinkageVal)
  */
 class L1Updater extends Updater {
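The four-case description above collapses to the single `signum`/`max` expression; as a standalone sketch:

```scala
import scala.math.{abs, max, signum}

def softThreshold(w: Double, shrinkageVal: Double): Double =
  signum(w) * max(0.0, abs(w) - shrinkageVal)

// softThreshold(0.7, 0.5)  == 0.2    (shrunk towards 0)
// softThreshold(-0.7, 0.5) == -0.2
// softThreshold(0.3, 0.5)  == 0.0    (inside the threshold, zeroed out)
```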
@@ -72,6 +84,9 @@ class L1Updater extends Updater {
   }
 }
 
+/**
+ * Updater that adjusts the learning rate and performs L2 regularization.
+ */
 class SquaredL2Updater extends Updater {
   override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
       stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {
@@ -22,6 +22,15 @@ import spark.SparkContext._
 
 import org.jblas._
 
+/**
+ * Model representing the result of matrix factorization.
+ *
+ * @param rank Rank for the features in this model.
+ * @param userFeatures RDD of tuples where each tuple represents the userId and
+ *                     the features computed for this user.
+ * @param productFeatures RDD of tuples where each tuple represents the productId
+ *                        and the features computed for this product.
+ */
 class MatrixFactorizationModel(
     val rank: Int,
     val userFeatures: RDD[(Int, Array[Double])],
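Given the factor RDDs documented above, predicting a rating reduces to a dot product of a user vector and a product vector. A sketch of what such a method plausibly looks like (the method body itself is outside this diff):

```scala
import org.jblas.DoubleMatrix

// Sketch only: combine the two factor vectors for one (user, product) pair.
def predict(user: Int, product: Int): Double = {
  val userVector = new DoubleMatrix(userFeatures.lookup(user).head)
  val productVector = new DoubleMatrix(productFeatures.lookup(product).head)
  userVector.dot(productVector)
}
```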
@@ -24,8 +24,11 @@ import org.jblas.DoubleMatrix
 
 /**
  * GeneralizedLinearModel (GLM) represents a model trained using
- * GeneralizedLinearAlgorithm. GLMs consist of a weight vector,
+ * GeneralizedLinearAlgorithm. GLMs consist of a weight vector and
  * an intercept.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
   extends Serializable {
@@ -43,6 +46,12 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
   def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
       intercept: Double): Double
 
+  /**
+   * Predict values for the given data set using the model trained.
+   *
+   * @param testData RDD representing data points to be predicted
+   * @return RDD[Double] where each entry contains the corresponding prediction
+   */
   def predict(testData: spark.RDD[Array[Double]]): RDD[Double] = {
     // A small optimization to avoid serializing the entire model. Only the weightsMatrix
     // and intercept is needed.
@@ -55,6 +64,12 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
     }
   }
 
+  /**
+   * Predict values for a single data point using the model trained.
+   *
+   * @param testData array representing a single data point
+   * @return Double prediction from the trained model
+   */
   def predict(testData: Array[Double]): Double = {
     val dataMat = new DoubleMatrix(1, testData.length, testData:_*)
     predictPoint(dataMat, weightsMatrix, intercept)
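Both newly documented `predict` overloads in one usage sketch (`model` is any trained GLM subclass; the feature values are made up):

```scala
// model: any trained GeneralizedLinearModel; testData: RDD[Array[Double]]
val batch = model.predict(testData)                   // RDD[Double], one prediction per row
val single = model.predict(Array(0.5, 1.2, -0.3))     // Double, one data point
```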
@@ -62,7 +77,7 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
 }
 
 /**
- * GeneralizedLinearAlgorithm abstracts out the training for all GLMs.
+ * GeneralizedLinearAlgorithm implements methods to train a Generalized Linear Model (GLM).
  * This class should be extended with an Optimizer to create a new GLM.
  */
 abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
@@ -70,9 +85,12 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
 
   val optimizer: Optimizer
 
-  def createModel(weights: Array[Double], intercept: Double): M
+  /**
+   * Create a model given the weights and intercept
+   */
+  protected def createModel(weights: Array[Double], intercept: Double): M
 
-  var addIntercept: Boolean
+  protected var addIntercept: Boolean
 
   /**
    * Set if the algorithm should add an intercept. Default true.
@@ -82,12 +100,20 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
     this
   }
 
+  /**
+   * Run the algorithm with the configured parameters on an input
+   * RDD of LabeledPoint entries.
+   */
   def run(input: RDD[LabeledPoint]) : M = {
     val nfeatures: Int = input.first().features.length
     val initialWeights = Array.fill(nfeatures)(1.0)
     run(input, initialWeights)
   }
 
+  /**
+   * Run the algorithm with the configured parameters on an input RDD
+   * of LabeledPoint entries starting from the initial weights provided.
+   */
   def run(input: RDD[LabeledPoint], initialWeights: Array[Double]) : M = {
 
     // Add a extra variable consisting of all 1.0's for the intercept.
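Putting the now-protected hooks together, a hypothetical subclass (illustration only, not part of this commit) wires an optimizer and `createModel` like this:

```scala
import org.jblas.DoubleMatrix
import spark.mllib.optimization.{GradientDescent, SimpleUpdater, SquaredGradient}

// Hypothetical model: plain linear prediction w.x + intercept.
class PlainLinearModel(override val weights: Array[Double], override val intercept: Double)
  extends GeneralizedLinearModel(weights, intercept) {
  override def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
      intercept: Double): Double = dataMatrix.dot(weightMatrix) + intercept
}

// Hypothetical algorithm: squared loss, no regularization.
class PlainLinearRegression extends GeneralizedLinearAlgorithm[PlainLinearModel] {
  val optimizer = new GradientDescent(new SquaredGradient(), new SimpleUpdater())
  protected var addIntercept: Boolean = true
  protected def createModel(weights: Array[Double], intercept: Double) =
    new PlainLinearModel(weights, intercept)
}
```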
@@ -24,8 +24,10 @@ import spark.mllib.util.MLUtils
 import org.jblas.DoubleMatrix
 
 /**
- * Lasso using Stochastic Gradient Descent.
+ * Regression model trained using Lasso.
  *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 class LassoModel(
   override val weights: Array[Double],
@@ -39,8 +41,10 @@ class LassoModel(
   }
 }
 
-class LassoWithSGD (
+/**
+ * Train a regression model with L1-regularization using Stochastic Gradient Descent.
+ */
+class LassoWithSGD private (
   var stepSize: Double,
   var numIterations: Int,
   var regParam: Double,
@@ -168,10 +168,10 @@ class RidgeRegression private (var lambdaLow: Double, var lambdaHigh: Double)
 
 /**
  * Top-level methods for calling Ridge Regression.
- * NOTE(shivaram): We use multiple train methods instead of default arguments to support
- * Java programs.
  */
 object RidgeRegression {
+  // NOTE(shivaram): We use multiple train methods instead of default arguments to support
+  // Java programs.
 
   /**
    * Train a ridge regression model given an RDD of (response, features) pairs.
@@ -21,12 +21,16 @@ import scala.util.Random
 
 import spark.{RDD, SparkContext}
 
+/**
+ * Generate test data for KMeans. This class first chooses k cluster centers
+ * from a d-dimensional Gaussian distribution scaled by factor r and then creates a Gaussian
+ * cluster with scale 1 around each center.
+ */
 object KMeansDataGenerator {
 
   /**
-   * Generate an RDD containing test data for KMeans. This function chooses k cluster centers
-   * from a d-dimensional Gaussian distribution scaled by factor r, then creates a Gaussian
-   * cluster with scale 1 around each center.
+   * Generate an RDD containing test data for KMeans.
    *
    * @param sc SparkContext to use for creating the RDD
    * @param numPoints Number of points that will be contained in the RDD
@@ -1,18 +1,22 @@
-package spark.mllib.regression
+package spark.mllib.util
 
 import scala.util.Random
 
 import org.jblas.DoubleMatrix
 
 import spark.{RDD, SparkContext}
-import spark.mllib.util.MLUtils
+import spark.mllib.regression.LabeledPoint
 
-object LassoGenerator {
+/**
+ * Generate sample data used for Lasso Regression. This class generates uniform random values
+ * for the features and adds Gaussian noise with weight 0.1 to generate response variables.
+ */
+object LassoDataGenerator {
 
   def main(args: Array[String]) {
-    if (args.length != 5) {
+    if (args.length < 2) {
       println("Usage: LassoGenerator " +
-        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
       System.exit(1)
     }

@@ -21,7 +25,6 @@ object LassoGenerator {
     val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
     val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
     val parts: Int = if (args.length > 4) args(4).toInt else 2
-    val eps = 3
 
     val sc = new SparkContext(sparkMaster, "LassoGenerator")
@@ -22,11 +22,15 @@ import scala.util.Random
 import spark.{RDD, SparkContext}
 import spark.mllib.regression.LabeledPoint
 
+/**
+ * Generate test data for LogisticRegression. This class chooses positive labels
+ * with probability `probOne` and scales features for positive examples by `eps`.
+ */
 object LogisticRegressionDataGenerator {
 
   /**
-   * Generate an RDD containing test data for LogisticRegression. This function chooses
-   * positive labels with probability `probOne` and scales positive examples by `eps`.
+   * Generate an RDD containing test data for LogisticRegression.
    *
    * @param sc SparkContext to use for creating the RDD.
    * @param nexamples Number of examples that will be contained in the RDD.
@@ -24,18 +24,19 @@ import org.jblas.DoubleMatrix
 import spark.mllib.regression.LabeledPoint
 
 /**
- * Helper methods to load and save data
- * Data format:
- * <l>, <f1> <f2> ...
- * where <f1>, <f2> are feature values in Double and <l> is the corresponding label as Double.
+ * Helper methods to load, save and pre-process data used in ML Lib.
  */
 object MLUtils {
 
   /**
+   * Load labeled data from a file. The data format used here is
+   * <L>, <f1> <f2> ...
+   * where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
+   *
    * @param sc SparkContext
    * @param dir Directory to the input data files.
-   * @return An RDD of tuples. For each tuple, the first element is the label, and the second
-   *         element represents the feature values (an array of Double).
+   * @return An RDD of LabeledPoint. Each labeled point has two elements: the first element is
+   *         the label, and the second element represents the feature values (an array of Double).
    */
   def loadLabeledData(sc: SparkContext, dir: String): RDD[LabeledPoint] = {
     sc.textFile(dir).map { line =>

@@ -46,6 +47,14 @@ object MLUtils {
     }
   }
 
+  /**
+   * Save labeled data to a file. The data format used here is
+   * <L>, <f1> <f2> ...
+   * where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
+   *
+   * @param data An RDD of LabeledPoints containing data to be saved.
+   * @param dir Directory to save the data.
+   */
   def saveLabeledData(data: RDD[LabeledPoint], dir: String) {
     val dataStr = data.map(x => x.label + "," + x.features.mkString(" "))
     dataStr.saveAsTextFile(dir)
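A round-trip sketch of the text format now documented on both methods (each line is `<label>,<f1> <f2> ...`; paths are placeholders):

```scala
import spark.SparkContext
import spark.mllib.util.MLUtils

val sc = new SparkContext("local", "MLUtilsRoundTrip")
val points = MLUtils.loadLabeledData(sc, "/tmp/labeled-in")   // RDD[LabeledPoint]
val positives = points.filter(_.label > 0.0)
MLUtils.saveLabeledData(positives, "/tmp/labeled-out")
```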
@@ -24,18 +24,24 @@ import org.jblas.DoubleMatrix
 import spark.{RDD, SparkContext}
 import spark.mllib.regression.LabeledPoint
 
+/**
+ * Generate sample data used for RidgeRegression. This class generates
+ * uniformly random values for every feature and adds Gaussian noise with mean `eps` to the
+ * response variable `Y`.
+ */
 object RidgeRegressionDataGenerator {
 
   /**
-   * Generate an RDD containing test data used for RidgeRegression. This function generates
-   * uniformly random values for every feature and adds Gaussian noise with mean `eps` to the
-   * response variable `Y`.
+   * Generate an RDD containing sample data for RidgeRegression.
    *
    * @param sc SparkContext to be used for generating the RDD.
    * @param nexamples Number of examples that will be contained in the RDD.
    * @param nfeatures Number of features to generate for each example.
    * @param eps Epsilon factor by which examples are scaled.
   * @param nparts Number of partitions in the RDD. Default value is 2.
+   *
+   * @return RDD of LabeledPoint containing sample data.
    */
   def generateRidgeRDD(
       sc: SparkContext,

@@ -69,9 +75,9 @@ object RidgeRegressionDataGenerator {
   }
 
   def main(args: Array[String]) {
-    if (args.length != 5) {
+    if (args.length < 2) {
       println("Usage: RidgeRegressionGenerator " +
-        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
       System.exit(1)
     }
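Besides the command line, the generator can be invoked programmatically with the parameters documented above. A sketch with illustrative values, assuming `eps` is a `Double` and that the generator lives in `spark.mllib.util` like the others touched by this commit:

```scala
import spark.SparkContext
import spark.mllib.util.RidgeRegressionDataGenerator

val sc = new SparkContext("local", "RidgeDataGen")
// 10000 examples, 50 features, eps = 3.0; nparts defaults to 2.
val data = RidgeRegressionDataGenerator.generateRidgeRDD(sc, 10000, 50, 3.0)
data.count()
```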
@@ -1,22 +1,23 @@
-package spark.mllib.classification
+package spark.mllib.util
 
 import scala.util.Random
 import scala.math.signum
 
-import org.jblas.DoubleMatrix
-
 import spark.{RDD, SparkContext}
-import spark.mllib.util.MLUtils
 
 import org.jblas.DoubleMatrix
 import spark.mllib.regression.LabeledPoint
 
-object SVMGenerator {
+/**
+ * Generate sample data used for SVM. This class generates uniform random values
+ * for the features and adds Gaussian noise with weight 0.1 to generate labels.
+ */
+object SVMDataGenerator {
 
   def main(args: Array[String]) {
-    if (args.length != 5) {
+    if (args.length < 2) {
       println("Usage: SVMGenerator " +
-        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
       System.exit(1)
     }

@@ -25,7 +26,6 @@ object SVMGenerator {
     val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
     val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
     val parts: Int = if (args.length > 4) args(4).toInt else 2
-    val eps = 3
 
     val sc = new SparkContext(sparkMaster, "SVMGenerator")