Merge branch 'master' into dev

# Conflicts:
#	csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML
This commit is contained in:
tqin 2016-03-31 10:43:29 +08:00
Родитель 0fabe0c983 831b5cb0dd
Коммит 3d9191102c
8 изменённых файлов: 274 добавлений и 10 удалений

Просмотреть файл

@ -161,7 +161,7 @@ namespace Microsoft.Spark.CSharp.Configuration
/// <summary>
/// Configuration mode for debug mode
/// This configuration exists only to make SparkCLR development & debugging easier
/// This configuration exists only to make SparkCLR development and debugging easier
/// </summary>
private class SparkCLRDebugConfiguration : SparkCLRLocalConfiguration
{
@ -212,12 +212,30 @@ namespace Microsoft.Spark.CSharp.Configuration
}
}
/// <summary>
/// The running mode used by Configuration Service
/// </summary>
public enum RunMode
{
/// <summary>
/// Unknown running mode
/// </summary>
UNKNOWN,
DEBUG, //not a Spark mode but exists for dev debugging purpose
/// <summary>
/// Debug mode, not a Spark mode but exists for development debugging purposes
/// </summary>
DEBUG,
/// <summary>
/// Indicates service is running in local
/// </summary>
LOCAL,
/// <summary>
/// Indicates service is running in cluster
/// </summary>
CLUSTER,
/// <summary>
/// Indicates service is running in Yarn
/// </summary>
YARN
//MESOS //not currently supported
}

Просмотреть файл

@ -38,10 +38,23 @@ namespace Microsoft.Spark.CSharp.Core
[ThreadStatic] // Thread safe is needed when running in C# worker
internal static Dictionary<int, Accumulator> threadLocalAccumulatorRegistry = new Dictionary<int, Accumulator>();
/// <summary>
/// The identity of the accumulator
/// </summary>
protected int accumulatorId;
[NonSerialized] // When deserialized in C# worker, isDriver is false.
/// <summary>
/// Indicates whether the accumulator is on driver side.
/// When deserialized on worker side, isDriver is false by default.
/// </summary>
[NonSerialized]
protected bool isDriver = false;
}
/// <summary>
/// A generic version of <see cref="Accumulator"/> where the element type is specified by the driver program.
/// </summary>
/// <typeparam name="T">The type of element in the accumulator.</typeparam>
[Serializable]
public class Accumulator<T> : Accumulator
{
@ -49,6 +62,11 @@ namespace Microsoft.Spark.CSharp.Core
internal T value;
private readonly AccumulatorParam<T> accumulatorParam = new AccumulatorParam<T>();
/// <summary>
/// Initializes a new instance of the Accumulator class with a specified identity and a value.
/// </summary>
/// <param name="accumulatorId">The Identity of the accumulator</param>
/// <param name="value">The value of the accumulator</param>
public Accumulator(int accumulatorId, T value)
{
this.accumulatorId = accumulatorId;
@ -57,6 +75,10 @@ namespace Microsoft.Spark.CSharp.Core
accumulatorRegistry[accumulatorId] = this;
}
/// <summary>
/// Gets or sets the value of the accumulator; only usable in driver program
/// </summary>
/// <exception cref="ArgumentException"></exception>
public T Value
{
// Get the accumulator's value; only usable in driver program
@ -115,6 +137,10 @@ namespace Microsoft.Spark.CSharp.Core
return self;
}
/// <summary>
/// Creates and returns a string representation of the current accumulator
/// </summary>
/// <returns>A string representation of the current accumulator</returns>
public override string ToString()
{
return string.Format("Accumulator<id={0}, value={1}>", accumulatorId, value);

Просмотреть файл

@ -31,6 +31,9 @@ namespace Microsoft.Spark.CSharp.Core
[Serializable]
public class Broadcast
{
/// <summary>
/// A thread-safe static collection that is used to store registered broadcast objects.
/// </summary>
[NonSerialized]
public static ConcurrentDictionary<long, Broadcast> broadcastRegistry = new ConcurrentDictionary<long, Broadcast>();
[NonSerialized]
@ -38,6 +41,11 @@ namespace Microsoft.Spark.CSharp.Core
internal long broadcastId;
internal Broadcast() { }
/// <summary>
/// Initializes a new instance of Broadcast class with a specified path.
/// </summary>
/// <param name="path">The path to be set.</param>
public Broadcast(string path)
{
this.path = path;
@ -60,6 +68,11 @@ namespace Microsoft.Spark.CSharp.Core
}
}
}
/// <summary>
/// A generic version of <see cref="Broadcast"/> where the element can be specified.
/// </summary>
/// <typeparam name="T">The type of element in Broadcast</typeparam>
[Serializable]
public class Broadcast<T> : Broadcast
{

Просмотреть файл

@ -9,6 +9,9 @@ using System.Threading.Tasks;
namespace Microsoft.Spark.CSharp.Core
{
/// <summary>
/// Extra functions available on RDDs of Doubles through an implicit conversion.
/// </summary>
public static class DoubleRDDFunctions
{
/// <summary>

Просмотреть файл

@ -16,17 +16,32 @@ namespace Microsoft.Spark.CSharp.Core
private bool isDefined = false;
private T value;
/// <summary>
/// Initializes an instance of the Option class without any value.
/// </summary>
public Option()
{ }
/// <summary>
/// Initializes an instance of the Option class with a specific value.
/// </summary>
/// <param name="value">The value to be associated with the new instance.</param>
public Option(T value)
{
isDefined = true;
this.value = value;
}
/// <summary>
/// Indicates whether the option value is defined.
/// </summary>
public bool IsDefined { get { return isDefined; } }
/// <summary>
/// Returns the value of the option if Option.IsDefined is TRUE;
/// otherwise, throws an <see cref="ArgumentException"/>.
/// </summary>
/// <returns></returns>
public T GetValue()
{
if (isDefined) return value;

Просмотреть файл

@ -10,6 +10,10 @@ using System.Threading.Tasks;
namespace Microsoft.Spark.CSharp.Core
{
/// <summary>
/// Extra functions available on RDDs of (key, value) pairs where the key is sortable through
/// a function to sort the key.
/// </summary>
public static class OrderedRDDFunctions
{

Просмотреть файл

@ -50,12 +50,47 @@
overridden here
</summary>
</member>
<!-- Badly formed XML comment ignored for member "T:Microsoft.Spark.CSharp.Configuration.ConfigurationService.SparkCLRDebugConfiguration" -->
<member name="T:Microsoft.Spark.CSharp.Configuration.ConfigurationService.SparkCLRDebugConfiguration">
<summary>
Configuration mode for debug mode
This configuration exists only to make SparkCLR development and debugging easier
</summary>
</member>
<member name="M:Microsoft.Spark.CSharp.Configuration.ConfigurationService.SparkCLRDebugConfiguration.GetCSharpWorkerExePath">
<summary>
The full path of the CSharp external backend worker process.
</summary>
</member>
<member name="T:Microsoft.Spark.CSharp.Configuration.RunMode">
<summary>
The running mode used by Configuration Service
</summary>
</member>
<member name="F:Microsoft.Spark.CSharp.Configuration.RunMode.UNKNOWN">
<summary>
Unknown running mode
</summary>
</member>
<member name="F:Microsoft.Spark.CSharp.Configuration.RunMode.DEBUG">
<summary>
Debug mode, not a Spark mode but exists for development debugging purposes
</summary>
</member>
<member name="F:Microsoft.Spark.CSharp.Configuration.RunMode.LOCAL">
<summary>
Indicates service is running in local
</summary>
</member>
<member name="F:Microsoft.Spark.CSharp.Configuration.RunMode.CLUSTER">
<summary>
Indicates service is running in cluster
</summary>
</member>
<member name="F:Microsoft.Spark.CSharp.Configuration.RunMode.YARN">
<summary>
Indicates service is running in Yarn
</summary>
</member>
<member name="T:Microsoft.Spark.CSharp.Core.Accumulator">
<summary>
A shared variable that can be accumulated, i.e., has a commutative and associative "add"
@ -71,6 +106,30 @@
</summary>
</member>
<member name="F:Microsoft.Spark.CSharp.Core.Accumulator.accumulatorId">
<summary>
The identity of the accumulator
</summary>
</member>
<member name="F:Microsoft.Spark.CSharp.Core.Accumulator.isDriver">
<summary>
Indicates whether the accumulator is on driver side.
When deserialized on worker side, isDriver is false by default.
</summary>
</member>
<member name="T:Microsoft.Spark.CSharp.Core.Accumulator`1">
<summary>
A generic version of <see cref="T:Microsoft.Spark.CSharp.Core.Accumulator"/> where the element type is specified by the driver program.
</summary>
<typeparam name="T">The type of element in the accumulator.</typeparam>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.Accumulator`1.#ctor(System.Int32,`0)">
<summary>
Initializes a new instance of the Accumulator class with a specified identity and a value.
</summary>
<param name="accumulatorId">The Identity of the accumulator</param>
<param name="value">The value of the accumulator</param>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.Accumulator`1.Add(`0)">
<summary>
Adds a term to this accumulator's value
@ -86,6 +145,18 @@
<param name="term"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.Accumulator`1.ToString">
<summary>
Creates and returns a string representation of the current accumulator
</summary>
<returns>A string representation of the current accumulator</returns>
</member>
<member name="P:Microsoft.Spark.CSharp.Core.Accumulator`1.Value">
<summary>
Gets or sets the value of the accumulator; only usable in driver program
</summary>
<exception cref="T:System.ArgumentException"></exception>
</member>
<member name="T:Microsoft.Spark.CSharp.Core.AccumulatorParam`1">
<summary>
An AccumulatorParam that uses the + operators to add values. Designed for simple types
@ -131,6 +202,23 @@
</summary>
</member>
<member name="F:Microsoft.Spark.CSharp.Core.Broadcast.broadcastRegistry">
<summary>
A thread-safe static collection that is used to store registered broadcast objects.
</summary>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.Broadcast.#ctor(System.String)">
<summary>
Initializes a new instance of Broadcast class with a specified path.
</summary>
<param name="path">The path to be set.</param>
</member>
<member name="T:Microsoft.Spark.CSharp.Core.Broadcast`1">
<summary>
A generic version of <see cref="T:Microsoft.Spark.CSharp.Core.Broadcast"/> where the element can be specified.
</summary>
<typeparam name="T">The type of element in Broadcast</typeparam>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.Broadcast`1.Unpersist(System.Boolean)">
<summary>
Delete cached copies of this broadcast on the executors.
@ -149,6 +237,29 @@
</summary>
<typeparam name="T"></typeparam>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.Option`1.#ctor">
<summary>
Initializes an instance of the Option class without any value.
</summary>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.Option`1.#ctor(`0)">
<summary>
Initializes an instance of the Option class with a specific value.
</summary>
<param name="value">The value to be associated with the new instance.</param>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.Option`1.GetValue">
<summary>
Returns the value of the option if Option.IsDefined is TRUE;
otherwise, throws an <see cref="T:System.ArgumentException"/>.
</summary>
<returns></returns>
</member>
<member name="P:Microsoft.Spark.CSharp.Core.Option`1.IsDefined">
<summary>
Indicates whether the option value is defined.
</summary>
</member>
<member name="T:Microsoft.Spark.CSharp.Core.Partitioner">
<summary>
An object that defines how the elements in a key-value pair RDD are partitioned by key.
@ -173,6 +284,11 @@
Interface for collect operation on RDD
</summary>
</member>
<member name="T:Microsoft.Spark.CSharp.Core.DoubleRDDFunctions">
<summary>
Extra functions available on RDDs of Doubles through an implicit conversion.
</summary>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.DoubleRDDFunctions.Sum(Microsoft.Spark.CSharp.Core.RDD{System.Double})">
<summary>
Add up the elements in this RDD.
@ -285,6 +401,12 @@
<param name="self"></param>
<returns></returns>
</member>
<member name="T:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions">
<summary>
Extra functions available on RDDs of (key, value) pairs where the key is sortable through
a function to sort the key.
</summary>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.SortByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Boolean,System.Nullable{System.Int32})">
<summary>
Sorts this RDD, which is assumed to consist of KeyValuePair pairs.
@ -4049,7 +4171,7 @@
<param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
<returns>A DStream object</returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateDirectStreamWithRepartition(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.Dictionary{System.String,System.String},System.Collections.Generic.Dictionary{System.String,System.Int64},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateDirectStreamWithRepartition(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.Dictionary{System.String,System.String},System.Collections.Generic.Dictionary{System.String,System.Int64},System.UInt32)">
<summary>
Create an input stream that directly pulls messages from a Kafka Broker and specific offset.
@ -4077,9 +4199,6 @@
user hint on how many kafka RDD partitions to create instead of aligning with kafka partitions,
unbalanced kafka partitions and/or under-distributed data will be redistributed evenly across
a probably larger number of RDD partitions
if numPartitions > 0, then it will try to rebalance kafkaRDD based on this user hint.
if numPartitions == 0, then will fall back to the default behavior of KafkaRDD
else, then will try to reblance KafkaRDD according to spark.streaming.kafka.maxRatePerTask.* conf
</param>
<returns>A DStream object</returns>
</member>
@ -4506,4 +4625,4 @@
<typeparam name="U"></typeparam>
</member>
</members>
</doc>
</doc>

Просмотреть файл

@ -19,7 +19,22 @@
####Methods
<table><tr><th>Name</th><th>Description</th></tr><tr><td><font color="blue"></font></td><td>Adds a term to this accumulator's value</td></tr><tr><td><font color="blue"></font></td><td>The += operator; adds a term to this accumulator's value</td></tr><tr><td><font color="blue"></font></td><td>Provide a "zero value" for the type</td></tr><tr><td><font color="blue"></font></td><td>Add two values of the accumulator's data type, returning a new value;</td></tr></table>
<table><tr><th>Name</th><th>Description</th></tr><tr><td><font color="blue"></font></td><td>Adds a term to this accumulator's value</td></tr><tr><td><font color="blue"></font></td><td>The += operator; adds a term to this accumulator's value</td></tr><tr><td><font color="blue"></font></td><td>Creates and returns a string representation of the current accumulator</td></tr><tr><td><font color="blue"></font></td><td>Provide a "zero value" for the type</td></tr><tr><td><font color="blue"></font></td><td>Add two values of the accumulator's data type, returning a new value;</td></tr></table>
---
###<font color="#68228B">Microsoft.Spark.CSharp.Core.Accumulator`1</font>
####Summary
A generic version of where the element type is specified by the driver program.
The type of element in the accumulator.
####Methods
<table><tr><th>Name</th><th>Description</th></tr><tr><td><font color="blue">Add</font></td><td>Adds a term to this accumulator's value</td></tr><tr><td><font color="blue">op_Addition</font></td><td>The += operator; adds a term to this accumulator's value</td></tr><tr><td><font color="blue">ToString</font></td><td>Creates and returns a string representation of the current accumulator</td></tr></table>
---
@ -74,6 +89,21 @@
---
###<font color="#68228B">Microsoft.Spark.CSharp.Core.Broadcast`1</font>
####Summary
A generic version of where the element can be specified.
The type of element in Broadcast
####Methods
<table><tr><th>Name</th><th>Description</th></tr><tr><td><font color="blue">Unpersist</font></td><td>Delete cached copies of this broadcast on the executors.</td></tr></table>
---
###<font color="#68228B">Microsoft.Spark.CSharp.Core.Option`1</font>
####Summary
@ -83,6 +113,13 @@
####Methods
<table><tr><th>Name</th><th>Description</th></tr><tr><td><font color="blue">GetValue</font></td><td>Returns the value of the option if Option.IsDefined is TRUE; otherwise, throws an .</td></tr></table>
---
###<font color="#68228B">Microsoft.Spark.CSharp.Core.Partitioner</font>
####Summary
@ -112,6 +149,35 @@
Interface for collect operation on RDD
###<font color="#68228B">Microsoft.Spark.CSharp.Core.DoubleRDDFunctions</font>
####Summary
Extra functions available on RDDs of Doubles through an implicit conversion.
####Methods
<table><tr><th>Name</th><th>Description</th></tr><tr><td><font color="blue">Sum</font></td><td>Add up the elements in this RDD. sc.Parallelize(new double[] {1.0, 2.0, 3.0}).Sum() 6.0</td></tr><tr><td><font color="blue">Stats</font></td><td>Return a object that captures the mean, variance and count of the RDD's elements in one operation.</td></tr><tr><td><font color="blue">Histogram</font></td><td>Compute a histogram using the provided buckets. The buckets are all open to the right except for the last which is closed. e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50], which means 1&lt;=x&lt;10, 10&lt;=x&lt;20, 20&lt;=x&lt;=50. And on the input of 1 and 50 we would have a histogram of 1,0,1. If your histogram is evenly spaced (e.g. [0, 10, 20, 30]), this can be switched from an O(log n) inseration to O(1) per element(where n = # buckets). Buckets must be sorted and not contain any duplicates, must be at least two elements. If `buckets` is a number, it will generates buckets which are evenly spaced between the minimum and maximum of the RDD. For example, if the min value is 0 and the max is 100, given buckets as 2, the resulting buckets will be [0,50) [50,100]. buckets must be at least 1 If the RDD contains infinity, NaN throws an exception If the elements in RDD do not vary (max == min) always returns a single bucket. It will return an tuple of buckets and histogram. &gt;&gt;&gt; rdd = sc.parallelize(range(51)) &gt;&gt;&gt; rdd.histogram(2) ([0, 25, 50], [25, 26]) &gt;&gt;&gt; rdd.histogram([0, 5, 25, 50]) ([0, 5, 25, 50], [5, 20, 26]) &gt;&gt;&gt; rdd.histogram([0, 15, 30, 45, 60]) # evenly spaced buckets ([0, 15, 30, 45, 60], [15, 15, 15, 6]) &gt;&gt;&gt; rdd = sc.parallelize(["ab", "ac", "b", "bd", "ef"]) &gt;&gt;&gt; rdd.histogram(("a", "b", "c")) (('a', 'b', 'c'), [2, 2])</td></tr><tr><td><font color="blue">Mean</font></td><td>Compute the mean of this RDD's elements. 
sc.Parallelize(new double[]{1, 2, 3}).Mean() 2.0</td></tr><tr><td><font color="blue">Variance</font></td><td>Compute the variance of this RDD's elements. sc.Parallelize(new double[]{1, 2, 3}).Variance() 0.666...</td></tr><tr><td><font color="blue">Stdev</font></td><td>Compute the standard deviation of this RDD's elements. sc.Parallelize(new double[]{1, 2, 3}).Stdev() 0.816...</td></tr><tr><td><font color="blue">SampleStdev</font></td><td>Compute the sample standard deviation of this RDD's elements (which corrects for bias in estimating the standard deviation by dividing by N-1 instead of N). sc.Parallelize(new double[]{1, 2, 3}).SampleStdev() 1.0</td></tr><tr><td><font color="blue">SampleVariance</font></td><td>Compute the sample variance of this RDD's elements (which corrects for bias in estimating the variance by dividing by N-1 instead of N). sc.Parallelize(new double[]{1, 2, 3}).SampleVariance() 1.0</td></tr></table>
---
###<font color="#68228B">Microsoft.Spark.CSharp.Core.OrderedRDDFunctions</font>
####Summary
Extra functions available on RDDs of (key, value) pairs where the key is sortable through
a function to sort the key.
####Methods
<table><tr><th>Name</th><th>Description</th></tr><tr><td><font color="blue">SortByKey``2</font></td><td>Sorts this RDD, which is assumed to consist of KeyValuePair pairs.</td></tr><tr><td><font color="blue">SortByKey``3</font></td><td>Sorts this RDD, which is assumed to consist of KeyValuePairs. If key is type of string, case is sensitive.</td></tr><tr><td><font color="blue">repartitionAndSortWithinPartitions``2</font></td><td>Repartition the RDD according to the given partitioner and, within each resulting partition, sort records by their keys. This is more efficient than calling `repartition` and then sorting within each partition because it can push the sorting down into the shuffle machinery.</td></tr></table>
---
###<font color="#68228B">Microsoft.Spark.CSharp.Core.PairRDDFunctions</font>
####Summary