Mirror of https://github.com/microsoft/spark.git

Clean up scaladoc in ML Lib.

Also build and copy ML Lib scaladoc in the Spark docs build. Some more minor
cleanup with respect to naming, test locations, etc.

Parent: e5b9ed2833
Commit: 4935a2558b
@@ -20,7 +20,7 @@ include FileUtils
 if ENV['SKIP_API'] != '1'
   # Build Scaladoc for Java/Scala
-  projects = ["core", "examples", "repl", "bagel", "streaming"]
+  projects = ["core", "examples", "repl", "bagel", "streaming", "mllib"]
 
   puts "Moving to project root and building scaladoc."
   curr_dir = pwd
@@ -27,8 +27,10 @@ import scala.math.round
 import org.jblas.DoubleMatrix
 
 /**
- * Logistic Regression using Stochastic Gradient Descent.
- * Based on Matlab code written by John Duchi.
+ * Classification model trained using Logistic Regression.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 class LogisticRegressionModel(
   override val weights: Array[Double],
@@ -43,7 +45,10 @@ class LogisticRegressionModel(
   }
 }
 
-class LogisticRegressionWithSGD (
+/**
+ * Train a classification model for Logistic Regression using Stochastic Gradient Descent.
+ */
+class LogisticRegressionWithSGD private (
   var stepSize: Double,
   var numIterations: Int,
   var regParam: Double,
@@ -70,10 +75,10 @@ class LogisticRegressionWithSGD (
 
 /**
  * Top-level methods for calling Logistic Regression.
- * NOTE(shivaram): We use multiple train methods instead of default arguments to support
- * Java programs.
  */
 object LogisticRegressionWithSGD {
+  // NOTE(shivaram): We use multiple train methods instead of default arguments to support
+  // Java programs.
 
   /**
    * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
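For reference, the multiple `train` overloads exist so Java callers can avoid Scala default arguments. A minimal usage sketch, assuming the `train(input, numIterations, stepSize)` overload and the local-mode `SparkContext` constructor from this era (the input path is a placeholder):

```scala
import spark.SparkContext
import spark.mllib.classification.LogisticRegressionWithSGD
import spark.mllib.util.MLUtils

object TrainLogisticRegression {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "TrainLogisticRegression")
    // RDD[LabeledPoint] parsed from "<label>,<f1> <f2> ..." lines.
    val points = MLUtils.loadLabeledData(sc, "/tmp/lr-input")
    // One of the Java-friendly overloads: 100 SGD iterations, step size 1.0.
    val model = LogisticRegressionWithSGD.train(points, 100, 1.0)
    val predictions = points.map(p => model.predict(p.features))
    sc.stop()
  }
}
```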
@@ -26,7 +26,10 @@ import spark.mllib.util.MLUtils
 import org.jblas.DoubleMatrix
 
 /**
- * SVM using Stochastic Gradient Descent.
+ * Model built using SVM.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 class SVMModel(
   override val weights: Array[Double],
@@ -40,6 +43,9 @@ class SVMModel(
   }
 }
 
+/**
+ * Train an SVM using Stochastic Gradient Descent.
+ */
 class SVMWithSGD private (
   var stepSize: Double,
   var numIterations: Int,
@@ -19,18 +19,29 @@ package spark.mllib.optimization
 
 import org.jblas.DoubleMatrix
 
+/**
+ * Class used to compute the gradient for a loss function, given a single data point.
+ */
 abstract class Gradient extends Serializable {
   /**
-   * Compute the gradient for a given row of data.
+   * Compute the gradient and loss given features of a single data point.
    *
-   * @param data - One row of data. Row matrix of size 1xn where n is the number of features.
+   * @param data - Feature values for one data point. Column matrix of size nx1
+   *               where n is the number of features.
    * @param label - Label for this data item.
    * @param weights - Column matrix containing weights for every feature.
+   *
+   * @return A tuple of 2 elements. The first element is a column matrix containing the computed
+   *         gradient and the second element is the loss computed at this data point.
+   *
    */
   def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double)
 }
 
+/**
+ * Compute gradient and loss for a logistic loss function.
+ */
 class LogisticGradient extends Gradient {
   override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {
@@ -49,7 +60,9 @@ class LogisticGradient extends Gradient {
   }
 }
 
+/**
+ * Compute gradient and loss for a Least-squared loss function.
+ */
 class SquaredGradient extends Gradient {
   override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {
@@ -62,7 +75,9 @@ class SquaredGradient extends Gradient {
   }
 }
 
+/**
+ * Compute gradient and loss for a Hinge loss function.
+ */
 class HingeGradient extends Gradient {
   override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
       (DoubleMatrix, Double) = {
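To make the `Gradient` contract concrete, here is a hypothetical implementation that is not part of this commit: an absolute-error gradient, following the nx1 column-matrix convention the commit settles on:

```scala
import org.jblas.DoubleMatrix

// Hypothetical example: absolute-error loss, loss = |w.x - y| and
// gradient = signum(w.x - y) * x. Returns the (gradient, loss) tuple.
class AbsoluteGradient extends Gradient {
  override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
      (DoubleMatrix, Double) = {
    val diff = data.dot(weights) - label   // data is an nx1 column matrix
    (data.mul(math.signum(diff)), math.abs(diff))
  }
}
```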
@@ -24,12 +24,17 @@ import org.jblas.DoubleMatrix
 
 import scala.collection.mutable.ArrayBuffer
 
+/**
+ * Class used to solve an optimization problem using Gradient Descent.
+ * @param gradient Gradient function to be used.
+ * @param updater Updater to be used to update weights after every iteration.
+ */
 class GradientDescent(var gradient: Gradient, var updater: Updater) extends Optimizer {
 
-  var stepSize: Double = 1.0
-  var numIterations: Int = 100
-  var regParam: Double = 0.0
-  var miniBatchFraction: Double = 1.0
+  private var stepSize: Double = 1.0
+  private var numIterations: Int = 100
+  private var regParam: Double = 0.0
+  private var miniBatchFraction: Double = 1.0
 
   /**
    * Set the step size per-iteration of SGD. Default 1.0.
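With these fields now private, callers configure the optimizer through its `set*` methods instead (the first one is shown above). A sketch of the resulting call style, assuming each setter is named after its field and returns `this` for chaining:

```scala
val sgd = new GradientDescent(new LogisticGradient(), new SimpleUpdater())
  .setStepSize(0.5)
  .setNumIterations(200)
  .setRegParam(0.0)
  .setMiniBatchFraction(0.1)
```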
@@ -97,10 +102,10 @@ class GradientDescent(var gradient: Gradient, var updater: Updater) extends Optimizer {
 
 }
 
+// Top-level method to run gradient descent.
 object GradientDescent extends Logging {
   /**
    * Run gradient descent in parallel using mini batches.
-   * Based on Matlab code written by John Duchi.
    *
    * @param data - Input data for SGD. RDD of form (label, [feature values]).
    * @param gradient - Gradient object that will be used to compute the gradient.
@@ -137,8 +142,8 @@ object GradientDescent extends Logging {
     for (i <- 1 to numIterations) {
       val (gradientSum, lossSum) = data.sample(false, miniBatchFraction, 42+i).map {
         case (y, features) =>
-          val featuresRow = new DoubleMatrix(features.length, 1, features:_*)
-          val (grad, loss) = gradient.compute(featuresRow, y, weights)
+          val featuresCol = new DoubleMatrix(features.length, 1, features:_*)
+          val (grad, loss) = gradient.compute(featuresCol, y, weights)
           (grad, loss)
       }.reduce((a, b) => (a._1.addi(b._1), a._2 + b._2))
@@ -20,10 +20,14 @@ package spark.mllib.optimization
 import scala.math._
 import org.jblas.DoubleMatrix
 
+/**
+ * Class used to update weights used in Gradient Descent.
+ */
 abstract class Updater extends Serializable {
   /**
-   * Compute an updated value for weights given the gradient, stepSize and iteration number.
-   * Also returns the regularization value computed using the *updated* weights.
+   * Compute an updated value for weights given the gradient, stepSize, iteration number and
+   * regularization parameter. Also returns the regularization value computed using the
+   * *updated* weights.
    *
    * @param weightsOld - Column matrix of size nx1 where n is the number of features.
    * @param gradient - Column matrix of size nx1 where n is the number of features.
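An updater satisfying this contract reduces to a few lines. A sketch modeled on `SimpleUpdater` (step size decaying with the square root of the iteration number, no regularization):

```scala
import org.jblas.DoubleMatrix

// Returns the new weights plus the regularization value (zero here,
// since no regularization is applied).
class NoRegUpdater extends Updater {
  override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
      stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {
    val thisIterStepSize = stepSize / math.sqrt(iter)
    (weightsOld.sub(gradient.mul(thisIterStepSize)), 0.0)
  }
}
```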
@@ -38,6 +42,10 @@ abstract class Updater extends Serializable {
       regParam: Double): (DoubleMatrix, Double)
 }
 
+/**
+ * A simple updater that adaptively adjusts the learning rate by the
+ * square root of the number of iterations. Does not perform any regularization.
+ */
 class SimpleUpdater extends Updater {
   override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
       stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {
@@ -48,11 +56,15 @@ class SimpleUpdater extends Updater {
 }
 
 /**
- * L1 regularization -- corresponding proximal operator is the soft-thresholding function
- * That is, each weight component is shrunk towards 0 by shrinkageVal
+ * Updater that adjusts learning rate and performs L1 regularization.
+ *
+ * The corresponding proximal operator used is the soft-thresholding function.
+ * That is, each weight component is shrunk towards 0 by shrinkageVal.
+ *
  * If w > shrinkageVal, set weight component to w-shrinkageVal.
  * If w < -shrinkageVal, set weight component to w+shrinkageVal.
  * If -shrinkageVal < w < shrinkageVal, set weight component to 0.
+ *
  * Equivalently, set weight component to signum(w) * max(0.0, abs(w) - shrinkageVal)
  */
 class L1Updater extends Updater {
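The four-case description above collapses to the single `signum`/`max` expression; as a standalone sketch:

```scala
import scala.math.{abs, max, signum}

def softThreshold(w: Double, shrinkageVal: Double): Double =
  signum(w) * max(0.0, abs(w) - shrinkageVal)

// softThreshold(0.7, 0.5)  == 0.2    (shrunk towards 0)
// softThreshold(-0.7, 0.5) == -0.2
// softThreshold(0.3, 0.5)  == 0.0    (inside the threshold, zeroed out)
```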
@@ -72,6 +84,9 @@ class L1Updater extends Updater {
   }
 }
 
+/**
+ * Updater that adjusts the learning rate and performs L2 regularization.
+ */
 class SquaredL2Updater extends Updater {
   override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
       stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {
@@ -22,6 +22,15 @@ import spark.SparkContext._
 
 import org.jblas._
 
+/**
+ * Model representing the result of matrix factorization.
+ *
+ * @param rank Rank for the features in this model.
+ * @param userFeatures RDD of tuples where each tuple represents the userId and
+ *                     the features computed for this user.
+ * @param productFeatures RDD of tuples where each tuple represents the productId
+ *                        and the features computed for this product.
+ */
 class MatrixFactorizationModel(
     val rank: Int,
     val userFeatures: RDD[(Int, Array[Double])],
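Given the factor RDDs documented above, predicting a rating reduces to a dot product of a user vector and a product vector. A sketch of what such a method plausibly looks like (the method body itself is outside this diff):

```scala
import org.jblas.DoubleMatrix

// Sketch only: combine the two factor vectors for one (user, product) pair.
def predict(user: Int, product: Int): Double = {
  val userVector = new DoubleMatrix(userFeatures.lookup(user).head)
  val productVector = new DoubleMatrix(productFeatures.lookup(product).head)
  userVector.dot(productVector)
}
```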
@@ -24,8 +24,11 @@ import org.jblas.DoubleMatrix
 
 /**
  * GeneralizedLinearModel (GLM) represents a model trained using
- * GeneralizedLinearAlgorithm. GLMs consist of a weight vector,
+ * GeneralizedLinearAlgorithm. GLMs consist of a weight vector and
  * an intercept.
+ *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
   extends Serializable {
@@ -43,6 +46,12 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
   def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
       intercept: Double): Double
 
+  /**
+   * Predict values for the given data set using the model trained.
+   *
+   * @param testData RDD representing data points to be predicted
+   * @return RDD[Double] where each entry contains the corresponding prediction
+   */
   def predict(testData: spark.RDD[Array[Double]]): RDD[Double] = {
     // A small optimization to avoid serializing the entire model. Only the weightsMatrix
     // and intercept is needed.
@@ -55,6 +64,12 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
     }
   }
 
+  /**
+   * Predict values for a single data point using the model trained.
+   *
+   * @param testData array representing a single data point
+   * @return Double prediction from the trained model
+   */
   def predict(testData: Array[Double]): Double = {
     val dataMat = new DoubleMatrix(1, testData.length, testData:_*)
     predictPoint(dataMat, weightsMatrix, intercept)
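Both newly documented `predict` overloads in one usage sketch (`model` is any trained GLM subclass; the feature values are made up):

```scala
// model: any trained GeneralizedLinearModel; testData: RDD[Array[Double]]
val batch = model.predict(testData)                   // RDD[Double], one prediction per row
val single = model.predict(Array(0.5, 1.2, -0.3))     // Double, one data point
```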
@@ -62,7 +77,7 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
 }
 
 /**
- * GeneralizedLinearAlgorithm abstracts out the training for all GLMs.
+ * GeneralizedLinearAlgorithm implements methods to train a Generalized Linear Model (GLM).
  * This class should be extended with an Optimizer to create a new GLM.
  */
 abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
@@ -70,9 +85,12 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
 
   val optimizer: Optimizer
 
-  def createModel(weights: Array[Double], intercept: Double): M
+  /**
+   * Create a model given the weights and intercept
+   */
+  protected def createModel(weights: Array[Double], intercept: Double): M
 
-  var addIntercept: Boolean
+  protected var addIntercept: Boolean
 
   /**
    * Set if the algorithm should add an intercept. Default true.
@@ -82,12 +100,20 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
     this
   }
 
+  /**
+   * Run the algorithm with the configured parameters on an input
+   * RDD of LabeledPoint entries.
+   */
   def run(input: RDD[LabeledPoint]) : M = {
     val nfeatures: Int = input.first().features.length
     val initialWeights = Array.fill(nfeatures)(1.0)
     run(input, initialWeights)
   }
 
+  /**
+   * Run the algorithm with the configured parameters on an input RDD
+   * of LabeledPoint entries starting from the initial weights provided.
+   */
   def run(input: RDD[LabeledPoint], initialWeights: Array[Double]) : M = {
 
     // Add a extra variable consisting of all 1.0's for the intercept.
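Putting the now-protected hooks together, a hypothetical subclass (illustration only, not part of this commit) wires an optimizer and `createModel` like this:

```scala
import org.jblas.DoubleMatrix
import spark.mllib.optimization.{GradientDescent, SimpleUpdater, SquaredGradient}

// Hypothetical model: plain linear prediction w.x + intercept.
class PlainLinearModel(override val weights: Array[Double], override val intercept: Double)
  extends GeneralizedLinearModel(weights, intercept) {
  override def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
      intercept: Double): Double = dataMatrix.dot(weightMatrix) + intercept
}

// Hypothetical algorithm: squared loss, no regularization.
class PlainLinearRegression extends GeneralizedLinearAlgorithm[PlainLinearModel] {
  val optimizer = new GradientDescent(new SquaredGradient(), new SimpleUpdater())
  protected var addIntercept: Boolean = true
  protected def createModel(weights: Array[Double], intercept: Double) =
    new PlainLinearModel(weights, intercept)
}
```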
@@ -24,8 +24,10 @@ import spark.mllib.util.MLUtils
 import org.jblas.DoubleMatrix
 
 /**
- * Lasso using Stochastic Gradient Descent.
+ * Regression model trained using Lasso.
  *
+ * @param weights Weights computed for every feature.
+ * @param intercept Intercept computed for this model.
  */
 class LassoModel(
   override val weights: Array[Double],
@@ -39,8 +41,10 @@ class LassoModel(
   }
 }
 
-class LassoWithSGD (
+/**
+ * Train a regression model with L1-regularization using Stochastic Gradient Descent.
+ */
+class LassoWithSGD private (
   var stepSize: Double,
   var numIterations: Int,
   var regParam: Double,
@@ -168,10 +168,10 @@ class RidgeRegression private (var lambdaLow: Double, var lambdaHigh: Double)
 
 /**
  * Top-level methods for calling Ridge Regression.
- * NOTE(shivaram): We use multiple train methods instead of default arguments to support
- * Java programs.
  */
 object RidgeRegression {
+  // NOTE(shivaram): We use multiple train methods instead of default arguments to support
+  // Java programs.
 
   /**
    * Train a ridge regression model given an RDD of (response, features) pairs.
@@ -21,12 +21,16 @@ import scala.util.Random
 
 import spark.{RDD, SparkContext}
 
+/**
+ * Generate test data for KMeans. This class first chooses k cluster centers
+ * from a d-dimensional Gaussian distribution scaled by factor r and then creates a Gaussian
+ * cluster with scale 1 around each center.
+ */
 object KMeansDataGenerator {
 
   /**
-   * Generate an RDD containing test data for KMeans. This function chooses k cluster centers
-   * from a d-dimensional Gaussian distribution scaled by factor r, then creates a Gaussian
-   * cluster with scale 1 around each center.
+   * Generate an RDD containing test data for KMeans.
    *
    * @param sc SparkContext to use for creating the RDD
    * @param numPoints Number of points that will be contained in the RDD
@@ -1,18 +1,22 @@
-package spark.mllib.regression
+package spark.mllib.util
 
 import scala.util.Random
 
 import org.jblas.DoubleMatrix
 
 import spark.{RDD, SparkContext}
-import spark.mllib.util.MLUtils
+import spark.mllib.regression.LabeledPoint
 
-object LassoGenerator {
+/**
+ * Generate sample data used for Lasso Regression. This class generates uniform random values
+ * for the features and adds Gaussian noise with weight 0.1 to generate response variables.
+ */
+object LassoDataGenerator {
 
   def main(args: Array[String]) {
-    if (args.length != 5) {
+    if (args.length < 2) {
       println("Usage: LassoGenerator " +
-        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
       System.exit(1)
     }

@@ -21,7 +25,6 @@ object LassoGenerator {
     val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
     val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
     val parts: Int = if (args.length > 4) args(4).toInt else 2
-    val eps = 3
 
     val sc = new SparkContext(sparkMaster, "LassoGenerator")
@@ -22,11 +22,15 @@ import scala.util.Random
 import spark.{RDD, SparkContext}
 import spark.mllib.regression.LabeledPoint
 
+/**
+ * Generate test data for LogisticRegression. This class chooses positive labels
+ * with probability `probOne` and scales features for positive examples by `eps`.
+ */
 object LogisticRegressionDataGenerator {
 
   /**
-   * Generate an RDD containing test data for LogisticRegression. This function chooses
-   * positive labels with probability `probOne` and scales positive examples by `eps`.
+   * Generate an RDD containing test data for LogisticRegression.
    *
    * @param sc SparkContext to use for creating the RDD.
    * @param nexamples Number of examples that will be contained in the RDD.
@@ -24,18 +24,19 @@ import org.jblas.DoubleMatrix
 import spark.mllib.regression.LabeledPoint
 
 /**
- * Helper methods to load and save data
- * Data format:
- * <l>, <f1> <f2> ...
- * where <f1>, <f2> are feature values in Double and <l> is the corresponding label as Double.
+ * Helper methods to load, save and pre-process data used in ML Lib.
  */
 object MLUtils {
 
   /**
+   * Load labeled data from a file. The data format used here is
+   * <L>, <f1> <f2> ...
+   * where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
+   *
    * @param sc SparkContext
    * @param dir Directory to the input data files.
-   * @return An RDD of tuples. For each tuple, the first element is the label, and the second
-   *         element represents the feature values (an array of Double).
+   * @return An RDD of LabeledPoint. Each labeled point has two elements: the first element is
+   *         the label, and the second element represents the feature values (an array of Double).
    */
   def loadLabeledData(sc: SparkContext, dir: String): RDD[LabeledPoint] = {
     sc.textFile(dir).map { line =>

@@ -46,6 +47,14 @@ object MLUtils {
     }
   }
 
+  /**
+   * Save labeled data to a file. The data format used here is
+   * <L>, <f1> <f2> ...
+   * where <f1>, <f2> are feature values in Double and <L> is the corresponding label as Double.
+   *
+   * @param data An RDD of LabeledPoints containing data to be saved.
+   * @param dir Directory to save the data.
+   */
   def saveLabeledData(data: RDD[LabeledPoint], dir: String) {
     val dataStr = data.map(x => x.label + "," + x.features.mkString(" "))
     dataStr.saveAsTextFile(dir)
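A round-trip sketch of the text format now documented on both methods (each line is `<label>,<f1> <f2> ...`; paths are placeholders):

```scala
import spark.SparkContext
import spark.mllib.util.MLUtils

val sc = new SparkContext("local", "MLUtilsRoundTrip")
val points = MLUtils.loadLabeledData(sc, "/tmp/labeled-in")   // RDD[LabeledPoint]
val positives = points.filter(_.label > 0.0)
MLUtils.saveLabeledData(positives, "/tmp/labeled-out")
```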
@@ -24,18 +24,24 @@ import org.jblas.DoubleMatrix
 import spark.{RDD, SparkContext}
 import spark.mllib.regression.LabeledPoint
 
+/**
+ * Generate sample data used for RidgeRegression. This class generates
+ * uniformly random values for every feature and adds Gaussian noise with mean `eps` to the
+ * response variable `Y`.
+ */
 object RidgeRegressionDataGenerator {
 
   /**
-   * Generate an RDD containing test data used for RidgeRegression. This function generates
-   * uniformly random values for every feature and adds Gaussian noise with mean `eps` to the
-   * response variable `Y`.
+   * Generate an RDD containing sample data for RidgeRegression.
    *
    * @param sc SparkContext to be used for generating the RDD.
    * @param nexamples Number of examples that will be contained in the RDD.
    * @param nfeatures Number of features to generate for each example.
    * @param eps Epsilon factor by which examples are scaled.
   * @param nparts Number of partitions in the RDD. Default value is 2.
+   *
+   * @return RDD of LabeledPoint containing sample data.
    */
   def generateRidgeRDD(
       sc: SparkContext,

@@ -69,9 +75,9 @@ object RidgeRegressionDataGenerator {
   }
 
   def main(args: Array[String]) {
-    if (args.length != 5) {
+    if (args.length < 2) {
       println("Usage: RidgeRegressionGenerator " +
-        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
       System.exit(1)
     }
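Besides the command line, the generator can be invoked programmatically with the parameters documented above. A sketch with illustrative values, assuming `eps` is a `Double` and that the generator lives in `spark.mllib.util` like the others touched by this commit:

```scala
import spark.SparkContext
import spark.mllib.util.RidgeRegressionDataGenerator

val sc = new SparkContext("local", "RidgeDataGen")
// 10000 examples, 50 features, eps = 3.0; nparts defaults to 2.
val data = RidgeRegressionDataGenerator.generateRidgeRDD(sc, 10000, 50, 3.0)
data.count()
```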
@@ -1,22 +1,23 @@
-package spark.mllib.classification
+package spark.mllib.util
 
 import scala.util.Random
 import scala.math.signum
 
-import org.jblas.DoubleMatrix
-
 import spark.{RDD, SparkContext}
-import spark.mllib.util.MLUtils
 
 import org.jblas.DoubleMatrix
 import spark.mllib.regression.LabeledPoint
 
-object SVMGenerator {
+/**
+ * Generate sample data used for SVM. This class generates uniform random values
+ * for the features and adds Gaussian noise with weight 0.1 to generate labels.
+ */
+object SVMDataGenerator {
 
   def main(args: Array[String]) {
-    if (args.length != 5) {
+    if (args.length < 2) {
       println("Usage: SVMGenerator " +
-        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
       System.exit(1)
     }

@@ -25,7 +26,6 @@ object SVMGenerator {
     val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
     val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
     val parts: Int = if (args.length > 4) args(4).toInt else 2
-    val eps = 3
 
     val sc = new SparkContext(sparkMaster, "SVMGenerator")