Mirror of https://github.com/microsoft/spark.git
Clean up scaladoc in ML Lib.
Also build and copy ML Lib scaladoc in the Spark docs build. Some more minor cleanup with respect to naming, test locations, etc.
Parent: e5b9ed2833
Commit: 4935a2558b
docs/_plugins/copy_api_dirs.rb

@@ -20,7 +20,7 @@ include FileUtils
 
 if ENV['SKIP_API'] != '1'
   # Build Scaladoc for Java/Scala
-  projects = ["core", "examples", "repl", "bagel", "streaming"]
+  projects = ["core", "examples", "repl", "bagel", "streaming", "mllib"]
 
   puts "Moving to project root and building scaladoc."
   curr_dir = pwd
spark/mllib/classification/LogisticRegression.scala

@@ -27,8 +27,10 @@ import scala.math.round
 import org.jblas.DoubleMatrix
 
 /**
- * Logistic Regression using Stochastic Gradient Descent.
- * Based on Matlab code written by John Duchi.
+ * Classification model trained using Logistic Regression.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 class LogisticRegressionModel(
   override val weights: Array[Double],

@@ -43,7 +45,10 @@ class LogisticRegressionModel(
   }
 }
 
-class LogisticRegressionWithSGD (
+/**
+ * Train a classification model for Logistic Regression using Stochastic Gradient Descent.
+ */
+class LogisticRegressionWithSGD private (
   var stepSize: Double,
   var numIterations: Int,
   var regParam: Double,

@@ -70,10 +75,10 @@ class LogisticRegressionWithSGD (
 
 /**
  * Top-level methods for calling Logistic Regression.
- * NOTE(shivaram): We use multiple train methods instead of default arguments to support
- * Java programs.
  */
 object LogisticRegressionWithSGD {
+  // NOTE(shivaram): We use multiple train methods instead of default arguments to support
+  // Java programs.
 
   /**
    * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
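For reference, the pattern that NOTE describes, reduced to a standalone sketch: Scala default arguments do not produce the overloads Java callers need, so each arity is spelled out by hand. The names below are hypothetical, not code from this commit.

// Sketch: overloads instead of default arguments, so Java can call every arity.
object TrainSketch {
  def train(data: Seq[Double], numIterations: Int, stepSize: Double): String =
    "trained for " + numIterations + " iterations with step size " + stepSize

  // Same as above with stepSize defaulting to 1.0.
  def train(data: Seq[Double], numIterations: Int): String =
    train(data, numIterations, 1.0)

  // Same as above with numIterations defaulting to 100.
  def train(data: Seq[Double]): String =
    train(data, 100)
}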
spark/mllib/classification/SVM.scala

@@ -26,7 +26,10 @@ import spark.mllib.util.MLUtils
 import org.jblas.DoubleMatrix
 
 /**
- * SVM using Stochastic Gradient Descent.
+ * Model built using SVM.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 class SVMModel(
   override val weights: Array[Double],

@@ -40,6 +43,9 @@ class SVMModel(
   }
 }
 
+/**
+ * Train an SVM using Stochastic Gradient Descent.
+ */
 class SVMWithSGD private (
   var stepSize: Double,
   var numIterations: Int,
spark/mllib/optimization/Gradient.scala

@@ -19,18 +19,29 @@ package spark.mllib.optimization
 
 import org.jblas.DoubleMatrix
 
+/**
+ * Class used to compute the gradient for a loss function, given a single data point.
+ */
 abstract class Gradient extends Serializable {
   /**
-   * Compute the gradient for a given row of data.
+   * Compute the gradient and loss given features of a single data point.
    *
-   * @param data - One row of data. Row matrix of size 1xn where n is the number of features.
+   * @param data - Feature values for one data point. Column matrix of size nx1
+   *               where n is the number of features.
    * @param label - Label for this data item.
    * @param weights - Column matrix containing weights for every feature.
+   *
+   * @return A tuple of 2 elements. The first element is a column matrix containing the computed
+   *         gradient and the second element is the loss computed at this data point.
+   *
    */
   def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double)
 }
 
+/**
+ * Compute gradient and loss for a logistic loss function.
+ */
 class LogisticGradient extends Gradient {
   override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {

@@ -49,7 +60,9 @@ class LogisticGradient extends Gradient {
   }
 }
 
+/**
+ * Compute gradient and loss for a Least-squared loss function.
+ */
 class SquaredGradient extends Gradient {
   override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {

@@ -62,7 +75,9 @@ class SquaredGradient extends Gradient {
   }
 }
 
+/**
+ * Compute gradient and loss for a Hinge loss function.
+ */
 class HingeGradient extends Gradient {
   override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {
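To make the documented contract concrete: for an nx1 feature column x, weight column w and label y, squared loss is 0.5 * (x.w - y)^2 and its gradient is (x.w - y) * x. A standalone sketch of that contract, assuming only jblas (not the file's actual SquaredGradient):

import org.jblas.DoubleMatrix

// Sketch of the Gradient contract above, instantiated for squared loss.
object SquaredGradientSketch {
  def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
      (DoubleMatrix, Double) = {
    val diff = data.dot(weights) - label   // prediction error x.w - y
    val gradient = data.mul(diff)          // nx1 column: (x.w - y) * x
    val loss = 0.5 * diff * diff
    (gradient, loss)
  }
}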
spark/mllib/optimization/GradientDescent.scala

@@ -24,12 +24,17 @@ import org.jblas.DoubleMatrix
 
 import scala.collection.mutable.ArrayBuffer
 
+/**
+ * Class used to solve an optimization problem using Gradient Descent.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
 class GradientDescent(var gradient: Gradient, var updater: Updater) extends Optimizer {
 
-  var stepSize: Double = 1.0
-  var numIterations: Int = 100
-  var regParam: Double = 0.0
-  var miniBatchFraction: Double = 1.0
+  private var stepSize: Double = 1.0
+  private var numIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0
 
   /**
    * Set the step size per-iteration of SGD. Default 1.0.

@@ -97,10 +102,10 @@ class GradientDescent(var gradient: Gradient, var updater: Updater) extends Opti
 
 }
 
+// Top-level method to run gradient descent.
 object GradientDescent extends Logging {
   /**
    * Run gradient descent in parallel using mini batches.
-   * Based on Matlab code written by John Duchi.
    *
    * @param data - Input data for SGD. RDD of form (label, [feature values]).
    * @param gradient - Gradient object that will be used to compute the gradient.

@@ -137,8 +142,8 @@ object GradientDescent extends Logging {
     for (i <- 1 to numIterations) {
       val (gradientSum, lossSum) = data.sample(false, miniBatchFraction, 42+i).map {
         case (y, features) =>
-          val featuresRow = new DoubleMatrix(features.length, 1, features:_*)
-          val (grad, loss) = gradient.compute(featuresRow, y, weights)
+          val featuresCol = new DoubleMatrix(features.length, 1, features:_*)
+          val (grad, loss) = gradient.compute(featuresCol, y, weights)
           (grad, loss)
       }.reduce((a, b) => (a._1.addi(b._1), a._2 + b._2))
 
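The last hunk is a pure rename: the features vector is built as an nx1 column matrix, so featuresCol is the accurate name. For illustration, the same mini-batch loop on a single machine, with the distributed sample/map/reduce replaced by plain collections (a sketch under that simplification, not the file's code):

import scala.util.Random
import org.jblas.DoubleMatrix

// Sketch: mini-batch gradient descent on local data, mirroring the loop above.
object MiniBatchSGDSketch {
  def run(data: Seq[(Double, Array[Double])], numIterations: Int,
      stepSize: Double, miniBatchFraction: Double): DoubleMatrix = {
    val n = data.head._2.length
    var weights = DoubleMatrix.ones(n, 1)
    for (i <- 1 to numIterations) {
      val rand = new Random(42 + i)  // seeded per iteration, as in the diff
      val batch = data.filter(_ => rand.nextDouble() < miniBatchFraction)
      val gradientSum = DoubleMatrix.zeros(n, 1)
      for ((y, features) <- batch) {
        val featuresCol = new DoubleMatrix(features.length, 1, features: _*)
        val diff = featuresCol.dot(weights) - y
        gradientSum.addi(featuresCol.mul(diff))  // accumulate squared-loss gradient
      }
      // SimpleUpdater-style step: stepSize decaying with the sqrt of the iteration.
      val step = stepSize / math.sqrt(i)
      weights = weights.sub(gradientSum.mul(step / math.max(batch.size, 1)))
    }
    weights
  }
}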
spark/mllib/optimization/Updater.scala

@@ -20,10 +20,14 @@ package spark.mllib.optimization
 import scala.math._
 import org.jblas.DoubleMatrix
 
+/**
+ * Class used to update weights used in Gradient Descent.
+ */
 abstract class Updater extends Serializable {
   /**
-   * Compute an updated value for weights given the gradient, stepSize and iteration number.
-   * Also returns the regularization value computed using the *updated* weights.
+   * Compute an updated value for weights given the gradient, stepSize, iteration number and
+   * regularization parameter. Also returns the regularization value computed using the
+   * *updated* weights.
    *
    * @param weightsOld - Column matrix of size nx1 where n is the number of features.
    * @param gradient - Column matrix of size nx1 where n is the number of features.

@@ -38,6 +42,10 @@ abstract class Updater extends Serializable {
       regParam: Double): (DoubleMatrix, Double)
 }
 
+/**
+ * A simple updater that adaptively adjusts the learning rate based on the
+ * square root of the number of iterations. Does not perform any regularization.
+ */
 class SimpleUpdater extends Updater {
   override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
       stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {

@@ -48,11 +56,15 @@ class SimpleUpdater extends Updater {
 }
 
 /**
- * L1 regularization -- corresponding proximal operator is the soft-thresholding function
- * That is, each weight component is shrunk towards 0 by shrinkageVal
+ * Updater that adjusts learning rate and performs L1 regularization.
+ *
+ * The corresponding proximal operator used is the soft-thresholding function.
+ * That is, each weight component is shrunk towards 0 by shrinkageVal.
+ *
  * If w > shrinkageVal, set weight component to w-shrinkageVal.
  * If w < -shrinkageVal, set weight component to w+shrinkageVal.
  * If -shrinkageVal < w < shrinkageVal, set weight component to 0.
+ *
  * Equivalently, set weight component to signum(w) * max(0.0, abs(w) - shrinkageVal)
  */
 class L1Updater extends Updater {

@@ -72,6 +84,9 @@ class L1Updater extends Updater {
   }
 }
 
+/**
+ * Updater that adjusts the learning rate and performs L2 regularization.
+ */
 class SquaredL2Updater extends Updater {
   override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
       stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {
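The L1Updater doc above fully determines the update. A standalone sketch of it, assuming shrinkageVal is the regularization parameter scaled by the decayed step size (stepSize / sqrt(iter)):

import org.jblas.DoubleMatrix
import scala.math.{abs, max, signum, sqrt}

// Sketch of the soft-thresholding update described above.
object L1UpdateSketch {
  def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
      stepSize: Double, iter: Int, regParam: Double): DoubleMatrix = {
    val thisIterStepSize = stepSize / sqrt(iter)
    val shrinkageVal = regParam * thisIterStepSize  // assumption, see lead-in
    // Plain gradient step first...
    val newWeights = weightsOld.sub(gradient.mul(thisIterStepSize))
    // ...then shrink every component towards 0 by shrinkageVal.
    for (i <- 0 until newWeights.length) {
      val w = newWeights.get(i)
      newWeights.put(i, signum(w) * max(0.0, abs(w) - shrinkageVal))
    }
    newWeights
  }
}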
spark/mllib/recommendation/MatrixFactorizationModel.scala

@@ -22,6 +22,15 @@ import spark.SparkContext._
 
 import org.jblas._
 
+/**
+ * Model representing the result of matrix factorization.
+ *
+ * @param rank Rank for the features in this model.
+ * @param userFeatures RDD of tuples where each tuple represents the userId and
+ *                     the features computed for this user.
+ * @param productFeatures RDD of tuples where each tuple represents the productId
+ *                        and the features computed for this product.
+ */
 class MatrixFactorizationModel(
     val rank: Int,
     val userFeatures: RDD[(Int, Array[Double])],
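Given the fields documented here, the predicted rating for a (user, product) pair is the dot product of the two rank-length feature vectors. A local sketch with Maps standing in for the RDDs (hypothetical helper, not the model's API):

// Sketch: prediction from factor vectors, Maps standing in for the RDDs.
object FactorizationPredictSketch {
  def predict(userFeatures: Map[Int, Array[Double]],
      productFeatures: Map[Int, Array[Double]],
      user: Int, product: Int): Double = {
    val u = userFeatures(user)        // rank-length vector for this user
    val p = productFeatures(product)  // rank-length vector for this product
    u.zip(p).map { case (a, b) => a * b }.sum
  }
}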
spark/mllib/regression/GeneralizedLinearAlgorithm.scala

@@ -24,8 +24,11 @@ import org.jblas.DoubleMatrix
 
 /**
  * GeneralizedLinearModel (GLM) represents a model trained using
- * GeneralizedLinearAlgorithm. GLMs consist of a weight vector,
+ * GeneralizedLinearAlgorithm. GLMs consist of a weight vector and
  * an intercept.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
   extends Serializable {

@@ -43,6 +46,12 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
   def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
       intercept: Double): Double
 
+  /**
+   * Predict values for the given data set using the model trained.
+   *
+   * @param testData RDD representing data points to be predicted
+   * @return RDD[Double] where each entry contains the corresponding prediction
+   */
   def predict(testData: spark.RDD[Array[Double]]): RDD[Double] = {
     // A small optimization to avoid serializing the entire model. Only the weightsMatrix
     // and intercept is needed.

@@ -55,6 +64,12 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
     }
   }
 
+  /**
+   * Predict values for a single data point using the model trained.
+   *
+   * @param testData array representing a single data point
+   * @return Double prediction from the trained model
+   */
   def predict(testData: Array[Double]): Double = {
     val dataMat = new DoubleMatrix(1, testData.length, testData:_*)
     predictPoint(dataMat, weightsMatrix, intercept)

@@ -62,7 +77,7 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
 }
 
 /**
- * GeneralizedLinearAlgorithm abstracts out the training for all GLMs.
+ * GeneralizedLinearAlgorithm implements methods to train a Generalized Linear Model (GLM).
  * This class should be extended with an Optimizer to create a new GLM.
  */
 abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]

@@ -70,9 +85,12 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
 
   val optimizer: Optimizer
 
-  def createModel(weights: Array[Double], intercept: Double): M
+  /**
+   * Create a model given the weights and intercept.
+   */
+  protected def createModel(weights: Array[Double], intercept: Double): M
 
-  var addIntercept: Boolean
+  protected var addIntercept: Boolean
 
   /**
    * Set if the algorithm should add an intercept. Default true.

@@ -82,12 +100,20 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
     this
   }
 
+  /**
+   * Run the algorithm with the configured parameters on an input
+   * RDD of LabeledPoint entries.
+   */
   def run(input: RDD[LabeledPoint]) : M = {
     val nfeatures: Int = input.first().features.length
     val initialWeights = Array.fill(nfeatures)(1.0)
     run(input, initialWeights)
   }
 
+  /**
+   * Run the algorithm with the configured parameters on an input RDD
+   * of LabeledPoint entries starting from the initial weights provided.
+   */
   def run(input: RDD[LabeledPoint], initialWeights: Array[Double]) : M = {
 
     // Add an extra variable consisting of all 1.0's for the intercept.
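The shape of the refactored abstraction, reduced to a Spark-free sketch: a concrete algorithm supplies the optimizer and the model factory, and inherits the run() plumbing. The names and the intercept-in-slot-0 convention below are assumptions for illustration, not the file's code.

// Spark-free sketch of the GeneralizedLinearAlgorithm shape.
trait OptimizerSketch {
  def optimize(data: Seq[(Double, Array[Double])],
      initialWeights: Array[Double]): Array[Double]
}

abstract class GLMSketch[M] {
  val optimizer: OptimizerSketch

  // Subclasses turn raw weights into their concrete model type.
  protected def createModel(weights: Array[Double], intercept: Double): M

  def run(data: Seq[(Double, Array[Double])]): M = {
    val nfeatures = data.head._2.length
    run(data, Array.fill(nfeatures)(1.0))
  }

  def run(data: Seq[(Double, Array[Double])],
      initialWeights: Array[Double]): M = {
    val weights = optimizer.optimize(data, initialWeights)
    // Assumption for this sketch: the first slot carries the intercept.
    createModel(weights.tail, weights.head)
  }
}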
spark/mllib/regression/Lasso.scala

@@ -24,8 +24,10 @@ import spark.mllib.util.MLUtils
 import org.jblas.DoubleMatrix
 
 /**
- * Lasso using Stochastic Gradient Descent.
+ * Regression model trained using Lasso.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 class LassoModel(
   override val weights: Array[Double],

@@ -39,8 +41,10 @@ class LassoModel(
   }
 }
 
-
-class LassoWithSGD (
+/**
+ * Train a regression model with L1-regularization using Stochastic Gradient Descent.
+ */
+class LassoWithSGD private (
   var stepSize: Double,
   var numIterations: Int,
   var regParam: Double,
spark/mllib/regression/RidgeRegression.scala

@@ -168,10 +168,10 @@ class RidgeRegression private (var lambdaLow: Double, var lambdaHigh: Double)
 
 /**
  * Top-level methods for calling Ridge Regression.
- * NOTE(shivaram): We use multiple train methods instead of default arguments to support
- * Java programs.
  */
 object RidgeRegression {
+  // NOTE(shivaram): We use multiple train methods instead of default arguments to support
+  // Java programs.
 
   /**
    * Train a ridge regression model given an RDD of (response, features) pairs.
spark/mllib/util/KMeansDataGenerator.scala

@@ -21,12 +21,16 @@ import scala.util.Random
 
 import spark.{RDD, SparkContext}
 
+/**
+ * Generate test data for KMeans. This class first chooses k cluster centers
+ * from a d-dimensional Gaussian distribution scaled by factor r and then creates a Gaussian
+ * cluster with scale 1 around each center.
+ */
 object KMeansDataGenerator {
 
   /**
-   * Generate an RDD containing test data for KMeans. This function chooses k cluster centers
-   * from a d-dimensional Gaussian distribution scaled by factor r, then creates a Gaussian
-   * cluster with scale 1 around each center.
+   * Generate an RDD containing test data for KMeans.
    *
    * @param sc SparkContext to use for creating the RDD
    * @param numPoints Number of points that will be contained in the RDD
spark/mllib/util/LassoDataGenerator.scala (moved from spark/mllib/regression/LassoGenerator.scala)

@@ -1,18 +1,22 @@
-package spark.mllib.regression
+package spark.mllib.util
 
 import scala.util.Random
 
 import org.jblas.DoubleMatrix
 
 import spark.{RDD, SparkContext}
-import spark.mllib.util.MLUtils
+import spark.mllib.regression.LabeledPoint
 
-object LassoGenerator {
+/**
+ * Generate sample data used for Lasso Regression. This class generates uniform random values
+ * for the features and adds Gaussian noise with weight 0.1 to generate response variables.
+ */
+object LassoDataGenerator {
 
   def main(args: Array[String]) {
-    if (args.length != 5) {
+    if (args.length < 2) {
       println("Usage: LassoGenerator " +
-        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
       System.exit(1)
     }
 

@@ -21,7 +25,6 @@ object LassoGenerator
     val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
     val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
     val parts: Int = if (args.length > 4) args(4).toInt else 2
-    val eps = 3
 
     val sc = new SparkContext(sparkMaster, "LassoGenerator")
 
spark/mllib/util/LogisticRegressionDataGenerator.scala

@@ -22,11 +22,15 @@ import scala.util.Random
 
 import spark.{RDD, SparkContext}
 import spark.mllib.regression.LabeledPoint
 
+/**
+ * Generate test data for LogisticRegression. This class chooses positive labels
+ * with probability `probOne` and scales features for positive examples by `eps`.
+ */
 object LogisticRegressionDataGenerator {
 
   /**
-   * Generate an RDD containing test data for LogisticRegression. This function chooses
-   * positive labels with probability `probOne` and scales positive examples by `eps`.
+   * Generate an RDD containing test data for LogisticRegression.
    *
    * @param sc SparkContext to use for creating the RDD.
    * @param nexamples Number of examples that will be contained in the RDD.
spark/mllib/util/MLUtils.scala

@@ -24,18 +24,19 @@ import org.jblas.DoubleMatrix
 import spark.mllib.regression.LabeledPoint
 
 /**
- * Helper methods to load and save data
- * Data format:
- * <l>, <f1> <f2> ...
- * where <f1>, <f2> are feature values in Double and <l> is the corresponding label as Double.
+ * Helper methods to load, save and pre-process data used in ML Lib.
  */
 object MLUtils {
 
   /**
    * Load labeled data from a file. The data format used here is
    * <L>, <f1> <f2> ...
    * where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
    *
    * @param sc SparkContext
    * @param dir Directory to the input data files.
-   * @return An RDD of tuples. For each tuple, the first element is the label, and the second
-   *         element represents the feature values (an array of Double).
+   * @return An RDD of LabeledPoint. Each labeled point has two elements: the first element is
+   *         the label, and the second element represents the feature values (an array of Double).
    */
   def loadLabeledData(sc: SparkContext, dir: String): RDD[LabeledPoint] = {
     sc.textFile(dir).map { line =>

@@ -46,6 +47,14 @@ object MLUtils {
     }
   }
 
+  /**
+   * Save labeled data to a file. The data format used here is
+   * <L>, <f1> <f2> ...
+   * where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
+   *
+   * @param data An RDD of LabeledPoints containing data to be saved.
+   * @param dir Directory to save the data.
+   */
   def saveLabeledData(data: RDD[LabeledPoint], dir: String) {
     val dataStr = data.map(x => x.label + "," + x.features.mkString(" "))
     dataStr.saveAsTextFile(dir)
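Concretely, the format documented above is one point per line: the label, a comma, then space-separated feature values. A Spark-free round-trip sketch of the same parsing and serialization (hypothetical names):

// Sketch of the "<label>, <f1> <f2> ..." format used by load/saveLabeledData.
case class PointSketch(label: Double, features: Array[Double])

object LabeledDataFormatSketch {
  def parse(line: String): PointSketch = {
    val parts = line.split(",")
    PointSketch(parts(0).toDouble, parts(1).trim.split(" ").map(_.toDouble))
  }

  def serialize(p: PointSketch): String =
    p.label + "," + p.features.mkString(" ")

  // Round trip: parse("1.0, 2.5 3.0") yields label 1.0, features [2.5, 3.0]
}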
spark/mllib/util/RidgeRegressionDataGenerator.scala

@@ -24,18 +24,24 @@ import org.jblas.DoubleMatrix
 
 import spark.{RDD, SparkContext}
 import spark.mllib.regression.LabeledPoint
 
+/**
+ * Generate sample data used for RidgeRegression. This class generates
+ * uniformly random values for every feature and adds Gaussian noise with mean `eps` to the
+ * response variable `Y`.
+ *
+ */
 object RidgeRegressionDataGenerator {
 
   /**
-   * Generate an RDD containing test data used for RidgeRegression. This function generates
-   * uniformly random values for every feature and adds Gaussian noise with mean `eps` to the
-   * response variable `Y`.
+   * Generate an RDD containing sample data for RidgeRegression.
    *
    * @param sc SparkContext to be used for generating the RDD.
    * @param nexamples Number of examples that will be contained in the RDD.
    * @param nfeatures Number of features to generate for each example.
    * @param eps Epsilon factor by which examples are scaled.
    * @param nparts Number of partitions in the RDD. Default value is 2.
+   *
+   * @return RDD of LabeledPoint containing sample data.
    */
   def generateRidgeRDD(
       sc: SparkContext,

@@ -69,9 +75,9 @@ object RidgeRegressionDataGenerator {
   }
 
   def main(args: Array[String]) {
-    if (args.length != 5) {
+    if (args.length < 2) {
       println("Usage: RidgeRegressionGenerator " +
-        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
       System.exit(1)
     }
 
spark/mllib/util/SVMDataGenerator.scala (moved from spark/mllib/classification/SVMGenerator.scala)

@@ -1,22 +1,23 @@
-package spark.mllib.classification
+package spark.mllib.util
 
 import scala.util.Random
+import scala.math.signum
+
+import org.jblas.DoubleMatrix
 
 import spark.{RDD, SparkContext}
-import spark.mllib.util.MLUtils
-
-import org.jblas.DoubleMatrix
+import spark.mllib.regression.LabeledPoint
 
-object SVMGenerator {
+/**
+ * Generate sample data used for SVM. This class generates uniform random values
+ * for the features and adds Gaussian noise with weight 0.1 to generate labels.
+ */
+object SVMDataGenerator {
 
   def main(args: Array[String]) {
-    if (args.length != 5) {
+    if (args.length < 2) {
       println("Usage: SVMGenerator " +
-        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
       System.exit(1)
     }
 

@@ -25,7 +26,6 @@ object SVMGenerator
     val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
     val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
     val parts: Int = if (args.length > 4) args(4).toInt else 2
-    val eps = 3
 
     val sc = new SparkContext(sparkMaster, "SVMGenerator")
 