Mirror of https://github.com/microsoft/spark.git
Merge pull request #809 from shivaram/sgd-cleanup
Clean up scaladoc in ML Lib.
Commit 4346f0a1e9
@@ -74,6 +74,7 @@
         <li><a href="api/core/index.html">Spark Java/Scala (Scaladoc)</a></li>
         <li><a href="api/pyspark/index.html">Spark Python (Epydoc)</a></li>
         <li><a href="api/streaming/index.html">Spark Streaming Java/Scala (Scaladoc) </a></li>
+        <li><a href="api/mllib/index.html">Spark ML Library (Scaladoc) </a></li>
       </ul>
     </li>

@@ -20,7 +20,7 @@ include FileUtils

 if ENV['SKIP_API'] != '1'
   # Build Scaladoc for Java/Scala
-  projects = ["core", "examples", "repl", "bagel", "streaming"]
+  projects = ["core", "examples", "repl", "bagel", "streaming", "mllib"]

   puts "Moving to project root and building scaladoc."
   curr_dir = pwd

@@ -27,8 +27,10 @@ import scala.math.round
 import org.jblas.DoubleMatrix

 /**
- * Logistic Regression using Stochastic Gradient Descent.
- * Based on Matlab code written by John Duchi.
+ * Classification model trained using Logistic Regression.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 class LogisticRegressionModel(
     override val weights: Array[Double],

@@ -43,7 +45,10 @@ class LogisticRegressionModel(
   }
 }

-class LogisticRegressionWithSGD (
+/**
+ * Train a classification model for Logistic Regression using Stochastic Gradient Descent.
+ */
+class LogisticRegressionWithSGD private (
     var stepSize: Double,
     var numIterations: Int,
     var regParam: Double,

@@ -70,10 +75,10 @@ class LogisticRegressionWithSGD (

 /**
  * Top-level methods for calling Logistic Regression.
- * NOTE(shivaram): We use multiple train methods instead of default arguments to support
- * Java programs.
  */
 object LogisticRegressionWithSGD {
+  // NOTE(shivaram): We use multiple train methods instead of default arguments to support
+  // Java programs.

   /**
    * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed

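For context, a minimal end-to-end use of these Java-friendly train overloads might look like the following sketch. The two-argument train overload and the local master string are assumptions based on the API of this era, not part of the diff:

```scala
import spark.SparkContext
import spark.mllib.classification.LogisticRegressionWithSGD
import spark.mllib.regression.LabeledPoint

object LogisticRegressionExample {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "LogisticRegressionExample")
    // Two toy points, one per class; features are plain Array[Double].
    val points = sc.parallelize(Seq(
      LabeledPoint(1.0, Array(2.0, 3.0)),
      LabeledPoint(0.0, Array(-2.0, -3.0))))
    // Train for 100 SGD iterations using one of the multiple train methods.
    val model = LogisticRegressionWithSGD.train(points, 100)
    println(model.predict(Array(1.5, 2.5)))
    sc.stop()
  }
}
```
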
@@ -26,7 +26,10 @@ import spark.mllib.util.MLUtils
 import org.jblas.DoubleMatrix

 /**
- * SVM using Stochastic Gradient Descent.
+ * Model built using SVM.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 class SVMModel(
     override val weights: Array[Double],

@@ -40,6 +43,9 @@ class SVMModel(
   }
 }

+/**
+ * Train an SVM using Stochastic Gradient Descent.
+ */
 class SVMWithSGD private (
     var stepSize: Double,
     var numIterations: Int,

@@ -19,18 +19,29 @@ package spark.mllib.optimization

 import org.jblas.DoubleMatrix

+/**
+ * Class used to compute the gradient for a loss function, given a single data point.
+ */
 abstract class Gradient extends Serializable {
   /**
-   * Compute the gradient for a given row of data.
+   * Compute the gradient and loss given features of a single data point.
    *
-   * @param data - One row of data. Row matrix of size 1xn where n is the number of features.
+   * @param data - Feature values for one data point. Column matrix of size nx1
+   *               where n is the number of features.
    * @param label - Label for this data item.
    * @param weights - Column matrix containing weights for every feature.
+   *
+   * @return A tuple of 2 elements. The first element is a column matrix containing the computed
+   *         gradient and the second element is the loss computed at this data point.
+   *
    */
   def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double)
 }

+/**
+ * Compute gradient and loss for a logistic loss function.
+ */
 class LogisticGradient extends Gradient {
   override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {

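To illustrate the contract spelled out in the new scaladoc (nx1 column matrix in, (gradient, loss) pair out), here is a hedged sketch of a custom Gradient; AbsoluteGradient is an invented name, not part of this commit:

```scala
import org.jblas.DoubleMatrix

// Sketch: absolute-error loss under the documented compute() contract.
// `data` and `weights` are nx1 column matrices; the result pairs the
// (sub)gradient with the loss at this single data point.
class AbsoluteGradient extends Gradient {
  override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
      (DoubleMatrix, Double) = {
    val diff = data.dot(weights) - label        // scalar residual
    val loss = math.abs(diff)                   // absolute-error loss
    val gradient = data.mul(math.signum(diff))  // subgradient: sign(diff) * x
    (gradient, loss)
  }
}
```
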
@@ -49,7 +60,9 @@ class LogisticGradient extends Gradient {
   }
 }

-
+/**
+ * Compute gradient and loss for a Least-squared loss function.
+ */
 class SquaredGradient extends Gradient {
   override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {

@@ -62,7 +75,9 @@ class SquaredGradient extends Gradient {
   }
 }

-
+/**
+ * Compute gradient and loss for a Hinge loss function.
+ */
 class HingeGradient extends Gradient {
   override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {

@@ -24,12 +24,17 @@ import org.jblas.DoubleMatrix

 import scala.collection.mutable.ArrayBuffer

+/**
+ * Class used to solve an optimization problem using Gradient Descent.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
 class GradientDescent(var gradient: Gradient, var updater: Updater) extends Optimizer {

-  var stepSize: Double = 1.0
-  var numIterations: Int = 100
-  var regParam: Double = 0.0
-  var miniBatchFraction: Double = 1.0
+  private var stepSize: Double = 1.0
+  private var numIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0

   /**
    * Set the step size per-iteration of SGD. Default 1.0.

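With the vars now private, configuration goes through the fluent setters documented above. A small sketch; the setter names follow the "Set the step size..." scaladoc pattern and are assumptions beyond what this hunk shows:

```scala
import spark.mllib.optimization.{GradientDescent, LogisticGradient, SimpleUpdater}

object OptimizerConfigExample {
  // Sketch: tune the optimizer through its fluent setters, each of which
  // returns `this` so calls can be chained.
  val optimizer = new GradientDescent(new LogisticGradient(), new SimpleUpdater())
    .setStepSize(0.5)
    .setNumIterations(200)
    .setMiniBatchFraction(1.0)
}
```
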
@@ -97,10 +102,10 @@ class GradientDescent(var gradient: Gradient, var updater: Updater) extends Optimizer {

 }

 // Top-level method to run gradient descent.
 object GradientDescent extends Logging {
   /**
    * Run gradient descent in parallel using mini batches.
-   * Based on Matlab code written by John Duchi.
+   *
    * @param data - Input data for SGD. RDD of form (label, [feature values]).
    * @param gradient - Gradient object that will be used to compute the gradient.

@@ -137,8 +142,8 @@ object GradientDescent extends Logging {
     for (i <- 1 to numIterations) {
       val (gradientSum, lossSum) = data.sample(false, miniBatchFraction, 42+i).map {
         case (y, features) =>
-          val featuresRow = new DoubleMatrix(features.length, 1, features:_*)
-          val (grad, loss) = gradient.compute(featuresRow, y, weights)
+          val featuresCol = new DoubleMatrix(features.length, 1, features:_*)
+          val (grad, loss) = gradient.compute(featuresCol, y, weights)
           (grad, loss)
       }.reduce((a, b) => (a._1.addi(b._1), a._2 + b._2))

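The rename makes it explicit that features are packed into an nx1 column matrix, matching the Gradient contract above. After the reduce, the summed gradient still has to be averaged over the mini-batch and handed to the Updater; a sketch of that step, with illustrative names only:

```scala
import org.jblas.DoubleMatrix
import spark.mllib.optimization.Updater

object MiniBatchUpdate {
  // Sketch: consume the reduced (gradientSum, lossSum) pair for iteration `iter`.
  // Average the summed gradient over the mini-batch, then let the Updater
  // produce the new weights along with the regularization value.
  def applyUpdate(
      updater: Updater,
      weights: DoubleMatrix,
      gradientSum: DoubleMatrix,
      miniBatchSize: Double,
      stepSize: Double,
      iter: Int,
      regParam: Double): (DoubleMatrix, Double) =
    updater.compute(weights, gradientSum.div(miniBatchSize), stepSize, iter, regParam)
}
```
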
@@ -20,10 +20,14 @@ package spark.mllib.optimization
 import scala.math._
 import org.jblas.DoubleMatrix

+/**
+ * Class used to update weights used in Gradient Descent.
+ */
 abstract class Updater extends Serializable {
   /**
-   * Compute an updated value for weights given the gradient, stepSize and iteration number.
-   * Also returns the regularization value computed using the *updated* weights.
+   * Compute an updated value for weights given the gradient, stepSize, iteration number and
+   * regularization parameter. Also returns the regularization value computed using the
+   * *updated* weights.
    *
    * @param weightsOld - Column matrix of size nx1 where n is the number of features.
    * @param gradient - Column matrix of size nx1 where n is the number of features.

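As a worked illustration of this contract, a bare-bones Updater with a decaying step size and no regularization might look like this (an invented example, close in spirit to the SimpleUpdater below):

```scala
import org.jblas.DoubleMatrix
import scala.math.sqrt

// Sketch: plain SGD step w_new = w_old - (stepSize / sqrt(iter)) * gradient,
// with no regularization, so the returned regularization value is 0.0.
class PlainSGDUpdater extends Updater {
  override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
      stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {
    val thisIterStepSize = stepSize / sqrt(iter)
    val newWeights = weightsOld.sub(gradient.mul(thisIterStepSize))
    (newWeights, 0.0)
  }
}
```
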
@@ -38,6 +42,10 @@ abstract class Updater extends Serializable {
       regParam: Double): (DoubleMatrix, Double)
 }

+/**
+ * A simple updater that adaptively adjusts the learning rate by the
+ * square root of the number of iterations. Does not perform any regularization.
+ */
 class SimpleUpdater extends Updater {
   override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
       stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {

@@ -48,11 +56,15 @@ class SimpleUpdater extends Updater {
 }

 /**
- * L1 regularization -- corresponding proximal operator is the soft-thresholding function
- * That is, each weight component is shrunk towards 0 by shrinkageVal
+ * Updater that adjusts learning rate and performs L1 regularization.
+ *
+ * The corresponding proximal operator used is the soft-thresholding function.
+ * That is, each weight component is shrunk towards 0 by shrinkageVal.
+ *
  * If w > shrinkageVal, set weight component to w-shrinkageVal.
  * If w < -shrinkageVal, set weight component to w+shrinkageVal.
  * If -shrinkageVal < w < shrinkageVal, set weight component to 0.
+ *
  * Equivalently, set weight component to signum(w) * max(0.0, abs(w) - shrinkageVal)
  */
 class L1Updater extends Updater {

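The three cases in that scaladoc collapse into the one-liner on its last line. A hedged sketch of the soft-thresholding step applied element-wise (SoftThreshold is an illustrative helper, not part of the commit):

```scala
import org.jblas.DoubleMatrix
import scala.math.{abs, max, signum}

object SoftThreshold {
  // Sketch: apply w -> signum(w) * max(0.0, abs(w) - shrinkageVal) to every
  // component of a weight vector. In L1Updater, shrinkageVal would be
  // derived from regParam and the current step size.
  def apply(weights: DoubleMatrix, shrinkageVal: Double): DoubleMatrix = {
    val result = new DoubleMatrix(weights.rows, weights.columns)
    for (i <- 0 until weights.length) {
      val w = weights.get(i)
      result.put(i, signum(w) * max(0.0, abs(w) - shrinkageVal))
    }
    result
  }
}
```
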
@@ -72,6 +84,9 @@ class L1Updater extends Updater {
   }
 }

+/**
+ * Updater that adjusts the learning rate and performs L2 regularization
+ */
 class SquaredL2Updater extends Updater {
   override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
       stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {

@@ -22,6 +22,15 @@ import spark.SparkContext._

 import org.jblas._

+/**
+ * Model representing the result of matrix factorization.
+ *
+ * @param rank Rank for the features in this model.
+ * @param userFeatures RDD of tuples where each tuple represents the userId and
+ *                     the features computed for this user.
+ * @param productFeatures RDD of tuples where each tuple represents the productId
+ *                        and the features computed for this product.
+ */
 class MatrixFactorizationModel(
     val rank: Int,
     val userFeatures: RDD[(Int, Array[Double])],

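Given those two feature RDDs, a rating prediction for a (user, product) pair is just the dot product of their feature vectors. A sketch of that idea (predictRating is an illustrative helper; the model's own predict method is the real entry point):

```scala
import org.jblas.DoubleMatrix
import spark.mllib.recommendation.MatrixFactorizationModel

object RatingSketch {
  // Sketch: predicted rating = userFeatures(user) . productFeatures(product).
  // lookup() returns all values for a key; head assumes the id is present once.
  def predictRating(model: MatrixFactorizationModel, user: Int, product: Int): Double = {
    val userVec = new DoubleMatrix(model.userFeatures.lookup(user).head)
    val productVec = new DoubleMatrix(model.productFeatures.lookup(product).head)
    userVec.dot(productVec)
  }
}
```
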
@@ -24,8 +24,11 @@ import org.jblas.DoubleMatrix

 /**
  * GeneralizedLinearModel (GLM) represents a model trained using
- * GeneralizedLinearAlgorithm. GLMs consist of a weight vector,
+ * GeneralizedLinearAlgorithm. GLMs consist of a weight vector and
  * an intercept.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
   extends Serializable {

@@ -43,6 +46,12 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
   def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
       intercept: Double): Double

+  /**
+   * Predict values for the given data set using the model trained.
+   *
+   * @param testData RDD representing data points to be predicted
+   * @return RDD[Double] where each entry contains the corresponding prediction
+   */
   def predict(testData: spark.RDD[Array[Double]]): RDD[Double] = {
     // A small optimization to avoid serializing the entire model. Only the weightsMatrix
     // and intercept is needed.

@@ -55,6 +64,12 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
     }
   }

+  /**
+   * Predict values for a single data point using the model trained.
+   *
+   * @param testData array representing a single data point
+   * @return Double prediction from the trained model
+   */
   def predict(testData: Array[Double]): Double = {
     val dataMat = new DoubleMatrix(1, testData.length, testData:_*)
     predictPoint(dataMat, weightsMatrix, intercept)

@@ -62,7 +77,7 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
 }

 /**
- * GeneralizedLinearAlgorithm abstracts out the training for all GLMs.
+ * GeneralizedLinearAlgorithm implements methods to train a Generalized Linear Model (GLM).
  * This class should be extended with an Optimizer to create a new GLM.
  */
 abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]

@@ -70,9 +85,12 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]

   val optimizer: Optimizer

-  def createModel(weights: Array[Double], intercept: Double): M
+  /**
+   * Create a model given the weights and intercept
+   */
+  protected def createModel(weights: Array[Double], intercept: Double): M

-  var addIntercept: Boolean
+  protected var addIntercept: Boolean

   /**
    * Set if the algorithm should add an intercept. Default true.

@@ -82,12 +100,20 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
     this
   }

+  /**
+   * Run the algorithm with the configured parameters on an input
+   * RDD of LabeledPoint entries.
+   */
   def run(input: RDD[LabeledPoint]) : M = {
     val nfeatures: Int = input.first().features.length
     val initialWeights = Array.fill(nfeatures)(1.0)
     run(input, initialWeights)
   }

+  /**
+   * Run the algorithm with the configured parameters on an input RDD
+   * of LabeledPoint entries starting from the initial weights provided.
+   */
   def run(input: RDD[LabeledPoint], initialWeights: Array[Double]) : M = {

     // Add an extra variable consisting of all 1.0's for the intercept.

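Putting the pieces together, a new GLM only has to supply an optimizer, a model factory, and the intercept flag. The following is a hypothetical sketch (the PlainLeastSquares* names are invented), wired from the members visible in these hunks:

```scala
import org.jblas.DoubleMatrix
import spark.mllib.optimization.{GradientDescent, SimpleUpdater, SquaredGradient}
import spark.mllib.regression.{GeneralizedLinearAlgorithm, GeneralizedLinearModel}

// Sketch: an unregularized least-squares GLM. The model predicts w.x + intercept.
class PlainLeastSquaresModel(
    override val weights: Array[Double],
    override val intercept: Double)
  extends GeneralizedLinearModel(weights, intercept) {
  override def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
      intercept: Double): Double =
    dataMatrix.dot(weightMatrix) + intercept
}

class PlainLeastSquaresWithSGD
  extends GeneralizedLinearAlgorithm[PlainLeastSquaresModel] {
  val optimizer = new GradientDescent(new SquaredGradient(), new SimpleUpdater())
  protected var addIntercept = true
  protected def createModel(weights: Array[Double], intercept: Double) =
    new PlainLeastSquaresModel(weights, intercept)
}
```
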
@@ -24,8 +24,10 @@ import spark.mllib.util.MLUtils

 import org.jblas.DoubleMatrix

 /**
- * Lasso using Stochastic Gradient Descent.
+ * Regression model trained using Lasso.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 class LassoModel(
     override val weights: Array[Double],

@@ -39,8 +41,10 @@ class LassoModel(
   }
 }

-
-class LassoWithSGD (
+/**
+ * Train a regression model with L1-regularization using Stochastic Gradient Descent.
+ */
+class LassoWithSGD private (
     var stepSize: Double,
     var numIterations: Int,
     var regParam: Double,

@@ -168,10 +168,10 @@ class RidgeRegression private (var lambdaLow: Double, var lambdaHigh: Double)

 /**
  * Top-level methods for calling Ridge Regression.
- * NOTE(shivaram): We use multiple train methods instead of default arguments to support
- * Java programs.
  */
 object RidgeRegression {
+  // NOTE(shivaram): We use multiple train methods instead of default arguments to support
+  // Java programs.

   /**
    * Train a ridge regression model given an RDD of (response, features) pairs.

@@ -21,12 +21,16 @@ import scala.util.Random

 import spark.{RDD, SparkContext}

+/**
+ * Generate test data for KMeans. This class first chooses k cluster centers
+ * from a d-dimensional Gaussian distribution scaled by factor r and then creates a Gaussian
+ * cluster with scale 1 around each center.
+ */
+
 object KMeansDataGenerator {

   /**
-   * Generate an RDD containing test data for KMeans. This function chooses k cluster centers
-   * from a d-dimensional Gaussian distribution scaled by factor r, then creates a Gaussian
-   * cluster with scale 1 around each center.
+   * Generate an RDD containing test data for KMeans.
    *
    * @param sc SparkContext to use for creating the RDD
    * @param numPoints Number of points that will be contained in the RDD

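A sketch of driving the generator programmatically; the remaining parameters (k clusters, d dimensions, scaling factor r) and their order are inferred from the scaladoc above, so treat the exact signature as an assumption:

```scala
import spark.SparkContext
import spark.mllib.util.KMeansDataGenerator

object KMeansDataExample {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "KMeansDataExample")
    // 1000 points around 5 centers in 3 dimensions, centers scaled by r = 10.0.
    val data = KMeansDataGenerator.generateKMeansRDD(sc, 1000, 5, 3, 10.0)
    println("Generated " + data.count() + " points")
    sc.stop()
  }
}
```
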
@@ -1,18 +1,22 @@
-package spark.mllib.regression
+package spark.mllib.util

 import scala.util.Random

 import org.jblas.DoubleMatrix

 import spark.{RDD, SparkContext}
-import spark.mllib.util.MLUtils
+import spark.mllib.regression.LabeledPoint

-object LassoGenerator {
+/**
+ * Generate sample data used for Lasso Regression. This class generates uniform random values
+ * for the features and adds Gaussian noise with weight 0.1 to generate response variables.
+ */
+object LassoDataGenerator {

   def main(args: Array[String]) {
-    if (args.length != 5) {
+    if (args.length < 2) {
       println("Usage: LassoGenerator " +
-        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
       System.exit(1)
     }

@@ -21,7 +25,6 @@ object LassoGenerator {
     val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
     val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
     val parts: Int = if (args.length > 4) args(4).toInt else 2
-    val eps = 3

     val sc = new SparkContext(sparkMaster, "LassoGenerator")

@@ -22,11 +22,15 @@ import scala.util.Random
 import spark.{RDD, SparkContext}
 import spark.mllib.regression.LabeledPoint

+/**
+ * Generate test data for LogisticRegression. This class chooses positive labels
+ * with probability `probOne` and scales features for positive examples by `eps`.
+ */
+
 object LogisticRegressionDataGenerator {

   /**
-   * Generate an RDD containing test data for LogisticRegression. This function chooses
-   * positive labels with probability `probOne` and scales positive examples by `eps`.
+   * Generate an RDD containing test data for LogisticRegression.
    *
    * @param sc SparkContext to use for creating the RDD.
    * @param nexamples Number of examples that will be contained in the RDD.

@@ -24,18 +24,19 @@ import org.jblas.DoubleMatrix
 import spark.mllib.regression.LabeledPoint

 /**
- * Helper methods to load and save data
- * Data format:
- * <l>, <f1> <f2> ...
- * where <f1>, <f2> are feature values in Double and <l> is the corresponding label as Double.
+ * Helper methods to load, save and pre-process data used in ML Lib.
  */
 object MLUtils {

   /**
+   * Load labeled data from a file. The data format used here is
+   * <L>, <f1> <f2> ...
+   * where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
+   *
    * @param sc SparkContext
    * @param dir Directory to the input data files.
-   * @return An RDD of tuples. For each tuple, the first element is the label, and the second
-   *         element represents the feature values (an array of Double).
+   * @return An RDD of LabeledPoint. Each labeled point has two elements: the first element is
+   *         the label, and the second element represents the feature values (an array of Double).
    */
   def loadLabeledData(sc: SparkContext, dir: String): RDD[LabeledPoint] = {
     sc.textFile(dir).map { line =>

@@ -46,6 +47,14 @@ object MLUtils {
     }
   }

+  /**
+   * Save labeled data to a file. The data format used here is
+   * <L>, <f1> <f2> ...
+   * where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
+   *
+   * @param data An RDD of LabeledPoints containing data to be saved.
+   * @param dir Directory to save the data.
+   */
   def saveLabeledData(data: RDD[LabeledPoint], dir: String) {
     val dataStr = data.map(x => x.label + "," + x.features.mkString(" "))
     dataStr.saveAsTextFile(dir)

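A round-trip through the documented text format, as a sketch; the file paths are illustrative only:

```scala
import spark.SparkContext
import spark.mllib.util.MLUtils

object MLUtilsRoundTrip {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "MLUtilsRoundTrip")
    // Each input line looks like "1.0, 0.5 2.3": a label, a comma, then
    // space-separated feature values, as described in the scaladoc above.
    val points = MLUtils.loadLabeledData(sc, "data/sample_labeled.txt")
    MLUtils.saveLabeledData(points, "data/sample_labeled_copy")
    sc.stop()
  }
}
```
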
@@ -24,18 +24,24 @@ import org.jblas.DoubleMatrix
 import spark.{RDD, SparkContext}
 import spark.mllib.regression.LabeledPoint

+/**
+ * Generate sample data used for RidgeRegression. This class generates
+ * uniformly random values for every feature and adds Gaussian noise with mean `eps` to the
+ * response variable `Y`.
+ *
+ */
 object RidgeRegressionDataGenerator {

   /**
-   * Generate an RDD containing test data used for RidgeRegression. This function generates
-   * uniformly random values for every feature and adds Gaussian noise with mean `eps` to the
-   * response variable `Y`.
+   * Generate an RDD containing sample data for RidgeRegression.
    *
    * @param sc SparkContext to be used for generating the RDD.
    * @param nexamples Number of examples that will be contained in the RDD.
    * @param nfeatures Number of features to generate for each example.
    * @param eps Epsilon factor by which examples are scaled.
    * @param nparts Number of partitions in the RDD. Default value is 2.
+   *
+   * @return RDD of LabeledPoint containing sample data.
    */
   def generateRidgeRDD(
       sc: SparkContext,

@@ -69,9 +75,9 @@ object RidgeRegressionDataGenerator {
   }

   def main(args: Array[String]) {
-    if (args.length != 5) {
+    if (args.length < 2) {
       println("Usage: RidgeRegressionGenerator " +
-        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
       System.exit(1)
     }

@@ -1,22 +1,23 @@
-package spark.mllib.classification
+package spark.mllib.util

 import scala.util.Random
 import scala.math.signum

 import org.jblas.DoubleMatrix

 import spark.{RDD, SparkContext}
-import spark.mllib.util.MLUtils

-import org.jblas.DoubleMatrix
+import spark.mllib.regression.LabeledPoint

-object SVMGenerator {
+/**
+ * Generate sample data used for SVM. This class generates uniform random values
+ * for the features and adds Gaussian noise with weight 0.1 to generate labels.
+ */
+object SVMDataGenerator {

   def main(args: Array[String]) {
-    if (args.length != 5) {
+    if (args.length < 2) {
       println("Usage: SVMGenerator " +
-        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
       System.exit(1)
     }

@@ -25,7 +26,6 @@ object SVMGenerator {
     val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
     val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
     val parts: Int = if (args.length > 4) args(4).toInt else 2
-    val eps = 3

     val sc = new SparkContext(sparkMaster, "SVMGenerator")