merging latest changes from Microsoft/master

This commit is contained in:
Kaarthik Sivashanmugam 2017-01-28 18:40:14 -08:00 committed by GitHub
Parent da69c1fef8 7d0f7ae1da
Commit e14b92ba43
80 changed files with 1668 additions and 929 deletions

View file

@ -2,6 +2,8 @@ language: csharp
solution: csharp/SparkCLR.sln
sudo: required
dist: trusty
env:
- JDK=openjdk7
before_install:
- sudo apt-get install xsltproc
- nuget install NUnit.Runners -Version 3.0.0 -OutputDirectory testrunner
@ -12,6 +14,8 @@ before_install:
- export M2="$M2_HOME/bin"
- export PATH="$M2:$PATH"
- hash -r
before_script:
- jdk_switcher use $JDK
script:
- export MAVEN_OPTS="-XX:MaxPermSize=2g -Xmx4g"
- export JAVA_OPTS="-XX:MaxPermSize=2g -Xmx4g"

View file

@ -8,7 +8,7 @@ For example, the word count sample in Apache Spark can be implemented in C# as f
```c#
var lines = sparkContext.TextFile(@"hdfs://path/to/input.txt");
var words = lines.FlatMap(s => s.Split(' '));
var wordCounts = words.Map(w => new KeyValuePair<string, int>(w.Trim(), 1))
var wordCounts = words.Map(w => new Tuple<string, int>(w.Trim(), 1))
.ReduceByKey((x, y) => x + y);
var wordCountCollection = wordCounts.Collect();
wordCounts.SaveAsTextFile(@"hdfs://path/to/wordcount.txt");
@ -63,7 +63,7 @@ StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpoint
.Map(kvp => Encoding.UTF8.GetString(kvp.Value))
.Filter(line => line.Contains(","))
.Map(line => line.Split(','))
.Map(columns => new KeyValuePair<string, int>(
.Map(columns => new Tuple<string, int>(
string.Format("{0},{1}", columns[0], columns[1]), 1))
.ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y,
windowDurationInSecs, slideDurationInSecs, 3)
@ -119,6 +119,7 @@ Refer to the [docs folder](docs) for design overview and other info on Mobius
* [Configuration parameters in Mobius](./notes/configuration-mobius.md)
* [Troubleshoot errors in Mobius](./notes/troubleshooting-mobius.md)
* [Debug Mobius apps](./notes/running-mobius-app.md#debug-mode)
* [Implementing Spark Apps in F# using Mobius](./notes/spark-fsharp-mobius.md)
## Supported Spark Versions

View file

@ -49,7 +49,8 @@ if "%precheck%" == "bad" (goto :EOF)
@rem
set SPARK_VERSION=2.0.0
set HADOOP_VERSION=2.6
@echo [RunSamples.cmd] SPARK_VERSION=%SPARK_VERSION%, HADOOP_VERSION=%HADOOP_VERSION%
set APACHE_DIST_SERVER=archive.apache.org
@echo [RunSamples.cmd] SPARK_VERSION=%SPARK_VERSION%, HADOOP_VERSION=%HADOOP_VERSION%, APACHE_DIST_SERVER=%APACHE_DIST_SERVER%
@rem download runtime dependencies
pushd "%CMDHOME%"

View file

@ -10,6 +10,9 @@
#
Param([string] $stage, [string] $verbose)
$envValue = [Environment]::GetEnvironmentVariable("APACHE_DIST_SERVER")
$apacheDistServer = if ($envValue -eq $null) { "archive.apache.org" } else { $envValue }
if ($stage.ToLower() -eq "run")
{
# retrieve hadoop and spark versions from environment variables
@ -19,7 +22,7 @@ if ($stage.ToLower() -eq "run")
$envValue = [Environment]::GetEnvironmentVariable("SPARK_VERSION")
$sparkVersion = if ($envValue -eq $null) { "1.6.1" } else { $envValue }
Write-Output "[downloadtools] hadoopVersion=$hadoopVersion, sparkVersion=$sparkVersion"
Write-Output "[downloadtools] hadoopVersion=$hadoopVersion, sparkVersion=$sparkVersion, apacheDistServer=$apacheDistServer"
}
function Get-ScriptDirectory
@ -73,8 +76,16 @@ function Download-File($url, $output)
$output = [System.IO.Path]::GetFullPath($output)
if (test-path $output)
{
Write-Output "[downloadtools.Download-File] $output exists. No need to download."
return
if ((Get-Item $output).Length -gt 0)
{
Write-Output "[downloadtools.Download-File] $output exists. No need to download."
return
}
else
{
Write-Output "[downloadtools.Download-File] [WARNING] $output exists but is empty. We need to download a new copy of the file."
Remove-Item $output
}
}
$start_time = Get-Date
@ -122,6 +133,11 @@ function Download-File($url, $output)
}
Write-Output "[downloadtools.Download-File] Download completed. Time taken: $howlong"
if ( !(test-path $output) -or (Get-Item $output).Length -eq 0)
{
throw [System.IO.FileNotFoundException] "Failed to download file $output from $url"
}
}
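
The Download-File hunk above adds two safeguards: a zero-length existing file is deleted and re-downloaded, and a missing or empty file after the download raises an error. A hedged C# analogue of the same logic (hypothetical names, URL and paths are placeholders, not code from this commit):

```c#
using System.IO;
using System.Net;

// Sketch only: mirrors the empty-file check and post-download validation
// added to downloadtools.ps1 above; not part of this repository.
internal static class DownloadSketch
{
    public static void DownloadFile(string url, string output)
    {
        if (File.Exists(output) && new FileInfo(output).Length > 0)
            return;                              // a usable copy is already present
        if (File.Exists(output))
            File.Delete(output);                 // zero-byte leftover: force a fresh download

        using (var client = new WebClient())
            client.DownloadFile(url, output);

        if (!File.Exists(output) || new FileInfo(output).Length == 0)
            throw new FileNotFoundException("Failed to download file " + output + " from " + url);
    }
}
```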
function Unzip-File($zipFile, $targetDir)
@ -252,7 +268,7 @@ function Download-BuildTools
$mvnCmd = "$toolsDir\$mvnVer\bin\mvn.cmd"
if (!(test-path $mvnCmd))
{
$url = "http://www.us.apache.org/dist/maven/maven-3/3.3.9/binaries/$mvnVer-bin.tar.gz"
$url = "http://$apacheDistServer/dist/maven/maven-3/3.3.9/binaries/$mvnVer-bin.tar.gz"
$output="$toolsDir\$mvnVer-bin.tar.gz"
Download-File $url $output
Untar-File $output $toolsDir
@ -402,7 +418,7 @@ function Download-RuntimeDependencies
$sparkSubmit="$S_HOME\bin\spark-submit.cmd"
if (!(test-path $sparkSubmit))
{
$url = "http://www.us.apache.org/dist/spark/spark-$sparkVersion/spark-$sparkVersion-bin-hadoop$hadoopVersion.tgz"
$url = "http://$apacheDistServer/dist/spark/spark-$sparkVersion/spark-$sparkVersion-bin-hadoop$hadoopVersion.tgz"
$output = "$toolsDir\spark-$sparkVersion-bin-hadoop$hadoopVersion.tgz"
Download-File $url $output
Untar-File $output $toolsDir

View file

@ -18,7 +18,8 @@ done
# setup Hadoop and Spark versions
export SPARK_VERSION=2.0.0
export HADOOP_VERSION=2.6
echo "[run-samples.sh] SPARK_VERSION=$SPARK_VERSION, HADOOP_VERSION=$HADOOP_VERSION"
export APACHE_DIST_SERVER=archive.apache.org
echo "[run-samples.sh] SPARK_VERSION=$SPARK_VERSION, HADOOP_VERSION=$HADOOP_VERSION, APACHE_DIST_SERVER=$APACHE_DIST_SERVER"
export FWDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
@ -30,7 +31,7 @@ export SPARK=spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION
export SPARK_HOME="$TOOLS_DIR/$SPARK"
if [ ! -d "$SPARK_HOME" ];
then
wget "http://www.us.apache.org/dist/spark/spark-$SPARK_VERSION/$SPARK.tgz" -O "$TOOLS_DIR/$SPARK.tgz"
wget "http://$APACHE_DIST_SERVER/dist/spark/spark-$SPARK_VERSION/$SPARK.tgz" -O "$TOOLS_DIR/$SPARK.tgz"
tar xfz "$TOOLS_DIR/$SPARK.tgz" -C "$TOOLS_DIR"
fi
export PATH="$SPARK_HOME/bin:$PATH"

View file

@ -157,6 +157,7 @@
<Compile Include="Sql\SparkSession.cs" />
<Compile Include="Sql\SqlContext.cs" />
<Compile Include="Sql\Types.cs" />
<Compile Include="Sql\UdfRegistration.cs" />
<Compile Include="Sql\UserDefinedFunction.cs" />
<Compile Include="Streaming\ConstantInputDStream.cs" />
<Compile Include="Streaming\DStream.cs" />

View file

@ -216,19 +216,19 @@ namespace Microsoft.Spark.CSharp.Core
for (int i = 0; i < numUpdates; i++)
{
var ms = new MemoryStream(SerDe.ReadBytes(ns));
KeyValuePair<int, dynamic> update = (KeyValuePair<int, dynamic>)formatter.Deserialize(ms);
var update = (Tuple<int, dynamic>)formatter.Deserialize(ms);
if (Accumulator.accumulatorRegistry.ContainsKey(update.Key))
if (Accumulator.accumulatorRegistry.ContainsKey(update.Item1))
{
Accumulator accumulator = Accumulator.accumulatorRegistry[update.Key];
accumulator.GetType().GetMethod("Add").Invoke(accumulator, new object[] { update.Value });
Accumulator accumulator = Accumulator.accumulatorRegistry[update.Item1];
accumulator.GetType().GetMethod("Add").Invoke(accumulator, new object[] { update.Item2 });
}
else
{
Console.Error.WriteLine("WARN: cann't find update.Key: {0} for accumulator, will create a new one", update.Key);
Console.Error.WriteLine("WARN: can't find update.Key: {0} for accumulator, will create a new one", update.Item1);
var genericAccumulatorType = typeof(Accumulator<>);
var specificAccumulatorType = genericAccumulatorType.MakeGenericType(update.Value.GetType());
Activator.CreateInstance(specificAccumulatorType, new object[] { update.Key, update.Value });
var specificAccumulatorType = genericAccumulatorType.MakeGenericType(update.Item2.GetType());
Activator.CreateInstance(specificAccumulatorType, new object[] { update.Item1, update.Item2 });
}
}
ns.WriteByte((byte)1); // acknowledge byte other than -1
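
The accumulator hunk above now deserializes each update as a Tuple&lt;int, dynamic> (accumulator id, partial value) instead of a KeyValuePair. A minimal round-trip sketch of that shape; the values are made up and BinaryFormatter usage simply mirrors the surrounding code:

```c#
// Illustrative only (requires System, System.IO and
// System.Runtime.Serialization.Formatters.Binary); runs inside a method body.
var formatter = new BinaryFormatter();
var ms = new MemoryStream();
formatter.Serialize(ms, new Tuple<int, dynamic>(42, 7));   // accumulator id 42, partial value 7
ms.Position = 0;

var update = (Tuple<int, dynamic>)formatter.Deserialize(ms);
Console.WriteLine("accumulator {0} += {1}", update.Item1, update.Item2);   // accumulator 42 += 7
```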

View file

@ -18,7 +18,7 @@ namespace Microsoft.Spark.CSharp.Core
{
/// <summary>
/// Sorts this RDD, which is assumed to consist of KeyValuePair pairs.
/// Sorts this RDD, which is assumed to consist of Tuple pairs.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
@ -26,13 +26,13 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="ascending"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, V>> SortByKey<K, V>(this RDD<KeyValuePair<K, V>> self,
public static RDD<Tuple<K, V>> SortByKey<K, V>(this RDD<Tuple<K, V>> self,
bool ascending = true, int? numPartitions = null)
{
return SortByKey<K, V, K>(self, ascending, numPartitions, new DefaultSortKeyFuncHelper<K>().Execute);
}
/// <summary>
/// Sorts this RDD, which is assumed to consist of KeyValuePairs. If key is type of string, case is sensitive.
/// Sorts this RDD, which is assumed to consist of Tuples. If Item1 is type of string, case is sensitive.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
@ -40,9 +40,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="self"></param>
/// <param name="ascending"></param>
/// <param name="numPartitions">Number of partitions. Each partition of the sorted RDD contains a sorted range of the elements.</param>
/// <param name="keyFunc">RDD will sort by keyFunc(key) for every key in KeyValuePair. Must not be null.</param>
/// <param name="keyFunc">RDD will sort by keyFunc(key) for every Item1 in Tuple. Must not be null.</param>
/// <returns></returns>
public static RDD<KeyValuePair<K, V>> SortByKey<K, V, U>(this RDD<KeyValuePair<K, V>> self,
public static RDD<Tuple<K, V>> SortByKey<K, V, U>(this RDD<Tuple<K, V>> self,
bool ascending, int? numPartitions, Func<K, U> keyFunc)
{
if (keyFunc == null)
@ -73,7 +73,7 @@ namespace Microsoft.Spark.CSharp.Core
/* first compute the boundary of each part via sampling: we want to partition
* the key-space into bins such that the bins have roughly the same
* number of (key, value) pairs falling into them */
U[] samples = self.Sample(false, fraction, 1).Map(kv => kv.Key).Collect().Select(k => keyFunc(k)).ToArray();
U[] samples = self.Sample(false, fraction, 1).Map(kv => kv.Item1).Collect().Select(k => keyFunc(k)).ToArray();
Array.Sort(samples, StringComparer.Ordinal); // case sensitive if key type is string
List<U> bounds = new List<U>();
@ -103,13 +103,13 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="partitionFunc"></param>
/// <param name="ascending"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, V>> repartitionAndSortWithinPartitions<K, V>(
this RDD<KeyValuePair<K, V>> self,
public static RDD<Tuple<K, V>> repartitionAndSortWithinPartitions<K, V>(
this RDD<Tuple<K, V>> self,
int? numPartitions = null,
Func<K, int> partitionFunc = null,
bool ascending = true)
{
return self.MapPartitionsWithIndex<KeyValuePair<K, V>>((pid, iter) => ascending ? iter.OrderBy(kv => kv.Key) : iter.OrderByDescending(kv => kv.Key));
return self.MapPartitionsWithIndex<Tuple<K, V>>((pid, iter) => ascending ? iter.OrderBy(kv => kv.Item1) : iter.OrderByDescending(kv => kv.Item1));
}
[Serializable]
@ -123,22 +123,22 @@ namespace Microsoft.Spark.CSharp.Core
this.ascending = ascending;
}
public IEnumerable<KeyValuePair<K, V>> Execute(int pid, IEnumerable<KeyValuePair<K, V>> kvs)
public IEnumerable<Tuple<K, V>> Execute(int pid, IEnumerable<Tuple<K, V>> kvs)
{
IEnumerable<KeyValuePair<K, V>> ordered;
IEnumerable<Tuple<K, V>> ordered;
if (ascending)
{
if (typeof(K) == typeof(string))
ordered = kvs.OrderBy(k => func(k.Key).ToString(), StringComparer.Ordinal);
ordered = kvs.OrderBy(k => func(k.Item1).ToString(), StringComparer.Ordinal);
else
ordered = kvs.OrderBy(k => func(k.Key));
ordered = kvs.OrderBy(k => func(k.Item1));
}
else
{
if (typeof(K) == typeof(string))
ordered = kvs.OrderByDescending(k => func(k.Key).ToString(), StringComparer.Ordinal);
ordered = kvs.OrderByDescending(k => func(k.Item1).ToString(), StringComparer.Ordinal);
else
ordered = kvs.OrderByDescending(k => func(k.Key));
ordered = kvs.OrderByDescending(k => func(k.Item1));
}
return ordered;
}
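
A hedged usage sketch of the Tuple-based SortByKey shown above; `sc` is assumed to be an already-constructed SparkContext, and the ordering reflects the ordinal (case-sensitive) string comparison documented in this hunk:

```c#
// Assumes an existing SparkContext `sc`.
var pairs = sc.Parallelize(new[]
{
    new Tuple<string, int>("banana", 2),
    new Tuple<string, int>("Apple", 1),
    new Tuple<string, int>("cherry", 3)
}, 2);

// Default: ascending, ordinal comparison on Item1.
var sorted = pairs.SortByKey().Collect();   // ("Apple", 1), ("banana", 2), ("cherry", 3)
```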

View file

@ -13,7 +13,7 @@ using Microsoft.Spark.CSharp.Interop.Ipc;
namespace Microsoft.Spark.CSharp.Core
{
/// <summary>
/// operations only available to KeyValuePair RDD
/// operations only available to Tuple RDD
///
/// See also http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions
/// </summary>
@ -22,7 +22,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <summary>
/// Return the key-value pairs in this RDD to the master as a dictionary.
///
/// var m = sc.Parallelize(new[] { new KeyValuePair&lt;int, int>(1, 2), new KeyValuePair&lt;int, int>(3, 4) }, 1).CollectAsMap()
/// var m = sc.Parallelize(new[] { new Tuple&lt;int, int>(1, 2), new Tuple&lt;int, int>(3, 4) }, 1).CollectAsMap()
/// m[1]
/// 2
/// m[3]
@ -33,30 +33,30 @@ namespace Microsoft.Spark.CSharp.Core
/// <typeparam name="V"></typeparam>
/// <param name="self"></param>
/// <returns></returns>
public static Dictionary<K, V> CollectAsMap<K, V>(this RDD<KeyValuePair<K, V>> self)
public static IDictionary<K, V> CollectAsMap<K, V>(this RDD<Tuple<K, V>> self)
{
return self.Collect().ToDictionary(kv => kv.Key, kv => kv.Value);
return self.Collect().ToDictionary(kv => kv.Item1, kv => kv.Item2);
}
/// <summary>
/// Return an RDD with the keys of each tuple.
///
/// >>> m = sc.Parallelize(new[] { new KeyValuePair&lt;int, int>(1, 2), new KeyValuePair&lt;int, int>(3, 4) }, 1).Keys().Collect()
/// >>> m = sc.Parallelize(new[] { new Tuple&lt;int, int>(1, 2), new Tuple&lt;int, int>(3, 4) }, 1).Keys().Collect()
/// [1, 3]
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <param name="self"></param>
/// <returns></returns>
public static RDD<K> Keys<K, V>(this RDD<KeyValuePair<K, V>> self)
public static RDD<K> Keys<K, V>(this RDD<Tuple<K, V>> self)
{
return self.Map<K>(kv => kv.Key);
return self.Map<K>(kv => kv.Item1);
}
/// <summary>
/// Return an RDD with the values of each tuple.
///
/// >>> m = sc.Parallelize(new[] { new KeyValuePair&lt;int, int>(1, 2), new KeyValuePair&lt;int, int>(3, 4) }, 1).Values().Collect()
/// >>> m = sc.Parallelize(new[] { new Tuple&lt;int, int>(1, 2), new Tuple&lt;int, int>(3, 4) }, 1).Values().Collect()
/// [2, 4]
///
/// </summary>
@ -64,9 +64,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <typeparam name="V"></typeparam>
/// <param name="self"></param>
/// <returns></returns>
public static RDD<V> Values<K, V>(this RDD<KeyValuePair<K, V>> self)
public static RDD<V> Values<K, V>(this RDD<Tuple<K, V>> self)
{
return self.Map<V>(kv => kv.Value);
return self.Map<V>(kv => kv.Item2);
}
/// <summary>
@ -80,9 +80,9 @@ namespace Microsoft.Spark.CSharp.Core
///
/// sc.Parallelize(new[]
/// {
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// new Tuple&lt;string, int>("a", 1),
/// new Tuple&lt;string, int>("b", 1),
/// new Tuple&lt;string, int>("a", 1)
/// }, 2)
/// .ReduceByKey((x, y) => x + y).Collect()
///
@ -95,9 +95,13 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="reduceFunc"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, V>> ReduceByKey<K, V>(this RDD<KeyValuePair<K, V>> self, Func<V, V, V> reduceFunc, int numPartitions = 0)
public static RDD<Tuple<K, V>> ReduceByKey<K, V>(this RDD<Tuple<K, V>> self, Func<V, V, V> reduceFunc, int numPartitions = 0)
{
return CombineByKey(self, () => default(V), reduceFunc, reduceFunc, numPartitions);
var locallyCombined = self.MapPartitionsWithIndex(new GroupByMergeHelper<K, V>(reduceFunc).Execute, true);
var shuffled = locallyCombined.PartitionBy(numPartitions);
return shuffled.MapPartitionsWithIndex(new GroupByMergeHelper<K, V>(reduceFunc).Execute, true);
}
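
The new ReduceByKey body above combines values inside each partition, shuffles the partially combined pairs with PartitionBy, and then merges again, instead of delegating to CombineByKey. A LINQ-only sketch of the map-side combine step on plain in-memory collections (illustrative names, not Mobius API):

```c#
// Illustrative only (requires System and System.Linq).
Func<int, int, int> reduceFunc = (x, y) => x + y;
var partition = new[]
{
    new Tuple<string, int>("a", 1),
    new Tuple<string, int>("b", 1),
    new Tuple<string, int>("a", 1)
};

// "Map-side" combine within one partition: ("a", 1), ("b", 1), ("a", 1) -> ("a", 2), ("b", 1)
var locallyCombined = partition.GroupBy(
    t => t.Item1,
    t => t.Item2,
    (k, vs) => new Tuple<string, int>(k, vs.Aggregate(reduceFunc)));
```

Only one pair per distinct key leaves each partition, so the data crossing the shuffle shrinks with the number of repeated keys; the same merge runs once more after PartitionBy to combine pairs arriving from different partitions.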
/// <summary>
@ -109,9 +113,9 @@ namespace Microsoft.Spark.CSharp.Core
///
/// sc.Parallelize(new[]
/// {
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// new Tuple&lt;string, int>("a", 1),
/// new Tuple&lt;string, int>("b", 1),
/// new Tuple&lt;string, int>("a", 1)
/// }, 2)
/// .ReduceByKeyLocally((x, y) => x + y).Collect()
///
@ -123,7 +127,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="self"></param>
/// <param name="reduceFunc"></param>
/// <returns></returns>
public static Dictionary<K, V> ReduceByKeyLocally<K, V>(this RDD<KeyValuePair<K, V>> self, Func<V, V, V> reduceFunc)
public static IDictionary<K, V> ReduceByKeyLocally<K, V>(this RDD<Tuple<K, V>> self, Func<V, V, V> reduceFunc)
{
return ReduceByKey(self, reduceFunc).CollectAsMap();
}
@ -133,9 +137,9 @@ namespace Microsoft.Spark.CSharp.Core
///
/// sc.Parallelize(new[]
/// {
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// new Tuple&lt;string, int>("a", 1),
/// new Tuple&lt;string, int>("b", 1),
/// new Tuple&lt;string, int>("a", 1)
/// }, 2)
/// .CountByKey((x, y) => x + y).Collect()
///
@ -146,9 +150,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <typeparam name="V"></typeparam>
/// <param name="self"></param>
/// <returns></returns>
public static Dictionary<K, long> CountByKey<K, V>(this RDD<KeyValuePair<K, V>> self)
public static IEnumerable<Tuple<K, long>> CountByKey<K, V>(this RDD<Tuple<K, V>> self)
{
return self.MapValues(v => 1L).ReduceByKey((a, b) => a + b).CollectAsMap();
return self.MapValues(v => 1L).ReduceByKey((a, b) => a + b).Collect();
}
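
CountByKey (like CollectAsMap and ReduceByKeyLocally above) now surfaces an interface type rather than a concrete Dictionary. A usage sketch, assuming an existing SparkContext `sc`:

```c#
var counts = sc.Parallelize(new[]
{
    new Tuple<string, int>("a", 1),
    new Tuple<string, int>("b", 1),
    new Tuple<string, int>("a", 1)
}, 2).CountByKey();                        // IEnumerable<Tuple<string, long>>

foreach (var kc in counts)
    Console.WriteLine("{0} -> {1}", kc.Item1, kc.Item2);   // a -> 2, b -> 1
```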
/// <summary>
@ -159,9 +163,9 @@ namespace Microsoft.Spark.CSharp.Core
/// Performs a hash join across the cluster.
///
/// var l = sc.Parallelize(
/// new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 1);
/// new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 1);
/// var r = sc.Parallelize(
/// new[] { new KeyValuePair&lt;string, int>("a", 2), new KeyValuePair&lt;string, int>("a", 3) }, 1);
/// new[] { new Tuple&lt;string, int>("a", 2), new Tuple&lt;string, int>("a", 3) }, 1);
/// var m = l.Join(r, 2).Collect();
///
/// [('a', (1, 2)), ('a', (1, 3))]
@ -174,9 +178,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, Tuple<V, W>>> Join<K, V, W>(
this RDD<KeyValuePair<K, V>> self,
RDD<KeyValuePair<K, W>> other,
public static RDD<Tuple<K, Tuple<V, W>>> Join<K, V, W>(
this RDD<Tuple<K, V>> self,
RDD<Tuple<K, W>> other,
int numPartitions = 0)
{
return self.GroupWith(other, numPartitions).FlatMapValues(
@ -194,9 +198,9 @@ namespace Microsoft.Spark.CSharp.Core
/// Hash-partitions the resulting RDD into the given number of partitions.
///
/// var l = sc.Parallelize(
/// new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 1);
/// new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 1);
/// var r = sc.Parallelize(
/// new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
/// new[] { new Tuple&lt;string, int>("a", 2) }, 1);
/// var m = l.LeftOuterJoin(r).Collect();
///
/// [('a', (1, 2)), ('b', (4, Option))]
@ -209,9 +213,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, Tuple<V, Option<W>>>> LeftOuterJoin<K, V, W>(
this RDD<KeyValuePair<K, V>> self,
RDD<KeyValuePair<K, W>> other,
public static RDD<Tuple<K, Tuple<V, Option<W>>>> LeftOuterJoin<K, V, W>(
this RDD<Tuple<K, V>> self,
RDD<Tuple<K, W>> other,
int numPartitions = 0)
{
return self.GroupWith(other, numPartitions).FlatMapValues(
@ -228,9 +232,9 @@ namespace Microsoft.Spark.CSharp.Core
/// Hash-partitions the resulting RDD into the given number of partitions.
///
/// var l = sc.Parallelize(
/// new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
/// new[] { new Tuple&lt;string, int>("a", 2) }, 1);
/// var r = sc.Parallelize(
/// new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 1);
/// new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 1);
/// var m = l.RightOuterJoin(r).Collect();
///
/// [('a', (2, 1)), ('b', (Option, 4))]
@ -243,9 +247,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, Tuple<Option<V>, W>>> RightOuterJoin<K, V, W>(
this RDD<KeyValuePair<K, V>> self,
RDD<KeyValuePair<K, W>> other,
public static RDD<Tuple<K, Tuple<Option<V>, W>>> RightOuterJoin<K, V, W>(
this RDD<Tuple<K, V>> self,
RDD<Tuple<K, W>> other,
int numPartitions = 0)
{
return self.GroupWith(other, numPartitions).FlatMapValues(
@ -267,9 +271,9 @@ namespace Microsoft.Spark.CSharp.Core
/// Hash-partitions the resulting RDD into the given number of partitions.
///
/// var l = sc.Parallelize(
/// new[] { new KeyValuePair&lt;string, int>("a", 1), KeyValuePair&lt;string, int>("b", 4) }, 1);
/// new[] { new Tuple&lt;string, int>("a", 1), Tuple&lt;string, int>("b", 4) }, 1);
/// var r = sc.Parallelize(
/// new[] { new KeyValuePair&lt;string, int>("a", 2), new KeyValuePair&lt;string, int>("c", 8) }, 1);
/// new[] { new Tuple&lt;string, int>("a", 2), new Tuple&lt;string, int>("c", 8) }, 1);
/// var m = l.FullOuterJoin(r).Collect();
///
/// [('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]
@ -282,9 +286,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, Tuple<Option<V>, Option<W>>>> FullOuterJoin<K, V, W>(
this RDD<KeyValuePair<K, V>> self,
RDD<KeyValuePair<K, W>> other,
public static RDD<Tuple<K, Tuple<Option<V>, Option<W>>>> FullOuterJoin<K, V, W>(
this RDD<Tuple<K, V>> self,
RDD<Tuple<K, W>> other,
int numPartitions = 0)
{
return self.GroupWith(other, numPartitions).FlatMapValues(
@ -295,13 +299,13 @@ namespace Microsoft.Spark.CSharp.Core
/// <summary>
/// Return a copy of the RDD partitioned using the specified partitioner.
///
/// sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1).Map(x => new KeyValuePair&lt;int, int>(x, x)).PartitionBy(3).Glom().Collect()
/// sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1).Map(x => new Tuple&lt;int, int>(x, x)).PartitionBy(3).Glom().Collect()
/// </summary>
/// <param name="self"></param>
/// <param name="numPartitions"></param>
/// <param name="partitionFunc"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, V>> PartitionBy<K, V>(this RDD<KeyValuePair<K, V>> self, int numPartitions = 0,
public static RDD<Tuple<K, V>> PartitionBy<K, V>(this RDD<Tuple<K, V>> self, int numPartitions = 0,
Func<dynamic, int> partitionFunc = null)
{
if (numPartitions == 0)
@ -318,7 +322,7 @@ namespace Microsoft.Spark.CSharp.Core
// convert shuffling version of RDD[(Long, Array[Byte])] back to normal RDD[Array[Byte]]
// invoking property keyed.RddProxy marks the end of current pipeline RDD after shuffling
// and potentially starts next pipeline RDD with default SerializedMode.Byte
var rdd = new RDD<KeyValuePair<K, V>>(self.sparkContext.SparkContextProxy.CreatePairwiseRDD(keyed.RddProxy, numPartitions,
var rdd = new RDD<Tuple<K, V>>(self.sparkContext.SparkContextProxy.CreatePairwiseRDD(keyed.RddProxy, numPartitions,
GenerateObjectId(partitionFunc)), self.sparkContext);
rdd.partitioner = partitioner;
@ -346,9 +350,9 @@ namespace Microsoft.Spark.CSharp.Core
/// sc.Parallelize(
/// new[]
/// {
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// new Tuple&lt;string, int>("a", 1),
/// new Tuple&lt;string, int>("b", 1),
/// new Tuple&lt;string, int>("a", 1)
/// }, 2)
/// .CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
///
@ -363,8 +367,8 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="mergeCombiners"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, C>> CombineByKey<K, V, C>(
this RDD<KeyValuePair<K, V>> self,
public static RDD<Tuple<K, C>> CombineByKey<K, V, C>(
this RDD<Tuple<K, V>> self,
Func<C> createCombiner,
Func<C, V, C> mergeValue,
Func<C, C, C> mergeCombiners,
@ -389,9 +393,9 @@ namespace Microsoft.Spark.CSharp.Core
/// sc.Parallelize(
/// new[]
/// {
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// new Tuple&lt;string, int>("a", 1),
/// new Tuple&lt;string, int>("b", 1),
/// new Tuple&lt;string, int>("a", 1)
/// }, 2)
/// .CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
///
@ -406,8 +410,8 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="combOp"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, U>> AggregateByKey<K, V, U>(
this RDD<KeyValuePair<K, V>> self,
public static RDD<Tuple<K, U>> AggregateByKey<K, V, U>(
this RDD<Tuple<K, V>> self,
Func<U> zeroValue,
Func<U, V, U> seqOp,
Func<U, U, U> combOp,
@ -425,9 +429,9 @@ namespace Microsoft.Spark.CSharp.Core
/// sc.Parallelize(
/// new[]
/// {
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// new Tuple&lt;string, int>("a", 1),
/// new Tuple&lt;string, int>("b", 1),
/// new Tuple&lt;string, int>("a", 1)
/// }, 2)
/// .CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
///
@ -440,8 +444,8 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="func"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, V>> FoldByKey<K, V>(
this RDD<KeyValuePair<K, V>> self,
public static RDD<Tuple<K, V>> FoldByKey<K, V>(
this RDD<Tuple<K, V>> self,
Func<V> zeroValue,
Func<V, V, V> func,
int numPartitions = 0)
@ -460,9 +464,9 @@ namespace Microsoft.Spark.CSharp.Core
/// sc.Parallelize(
/// new[]
/// {
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// new Tuple&lt;string, int>("a", 1),
/// new Tuple&lt;string, int>("b", 1),
/// new Tuple&lt;string, int>("a", 1)
/// }, 2)
/// .GroupByKey().MapValues(l => string.Join(" ", l)).Collect()
///
@ -474,7 +478,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="self"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, List<V>>> GroupByKey<K, V>(this RDD<KeyValuePair<K, V>> self, int numPartitions = 0)
public static RDD<Tuple<K, List<V>>> GroupByKey<K, V>(this RDD<Tuple<K, V>> self, int numPartitions = 0)
{
return CombineByKey(self,
() => new List<V>(),
@ -490,8 +494,8 @@ namespace Microsoft.Spark.CSharp.Core
/// sc.Parallelize(
/// new[]
/// {
/// new KeyValuePair&lt;string, string[]>("a", new[]{"apple", "banana", "lemon"}),
/// new KeyValuePair&lt;string, string[]>("b", new[]{"grapes"})
/// new Tuple&lt;string, string[]>("a", new[]{"apple", "banana", "lemon"}),
/// new Tuple&lt;string, string[]>("b", new[]{"grapes"})
/// }, 2)
/// .MapValues(x => x.Length).Collect()
///
@ -504,7 +508,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="self"></param>
/// <param name="func"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, U>> MapValues<K, V, U>(this RDD<KeyValuePair<K, V>> self, Func<V, U> func)
public static RDD<Tuple<K, U>> MapValues<K, V, U>(this RDD<Tuple<K, V>> self, Func<V, U> func)
{
return self.Map(new MapValuesHelper<K, V, U>(func).Execute, true);
}
@ -516,8 +520,8 @@ namespace Microsoft.Spark.CSharp.Core
/// x = sc.Parallelize(
/// new[]
/// {
/// new KeyValuePair&lt;string, string[]>("a", new[]{"x", "y", "z"}),
/// new KeyValuePair&lt;string, string[]>("b", new[]{"p", "r"})
/// new Tuple&lt;string, string[]>("a", new[]{"x", "y", "z"}),
/// new Tuple&lt;string, string[]>("b", new[]{"p", "r"})
/// }, 2)
/// .FlatMapValues(x => x).Collect()
///
@ -530,13 +534,13 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="self"></param>
/// <param name="func"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, U>> FlatMapValues<K, V, U>(this RDD<KeyValuePair<K, V>> self, Func<V, IEnumerable<U>> func)
public static RDD<Tuple<K, U>> FlatMapValues<K, V, U>(this RDD<Tuple<K, V>> self, Func<V, IEnumerable<U>> func)
{
return self.FlatMap(new FlatMapValuesHelper<K, V, U>(func).Execute, true);
}
/// <summary>
/// explicitly convert KeyValuePair&lt;K, V> to KeyValuePair&lt;K, dynamic>
/// explicitly convert Tuple&lt;K, V> to Tuple&lt;K, dynamic>
/// since they are incompatible types, unlike V to dynamic
/// </summary>
/// <typeparam name="K"></typeparam>
@ -546,10 +550,10 @@ namespace Microsoft.Spark.CSharp.Core
/// <typeparam name="W3"></typeparam>
/// <param name="self"></param>
/// <returns></returns>
private static RDD<KeyValuePair<K, dynamic>> MapPartitionsWithIndex<K, V, W1, W2, W3>(this RDD<KeyValuePair<K, dynamic>> self)
private static RDD<Tuple<K, dynamic>> MapPartitionsWithIndex<K, V, W1, W2, W3>(this RDD<Tuple<K, dynamic>> self)
{
CSharpWorkerFunc csharpWorkerFunc = new CSharpWorkerFunc(new DynamicTypingWrapper<K, V, W1, W2, W3>().Execute);
var pipelinedRDD = new PipelinedRDD<KeyValuePair<K, dynamic>>
var pipelinedRDD = new PipelinedRDD<Tuple<K, dynamic>>
{
workerFunc = csharpWorkerFunc,
preservesPartitioning = true,
@ -568,8 +572,8 @@ namespace Microsoft.Spark.CSharp.Core
/// For each key k in this RDD or <paramref name="other"/>, return a resulting RDD that
/// contains a tuple with the list of values for that key in this RDD as well as <paramref name="other"/>.
///
/// var x = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 2);
/// var y = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
/// var x = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 2);
/// var y = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 2) }, 1);
/// x.GroupWith(y).Collect();
///
/// [('a', ([1], [2])), ('b', ([4], []))]
@ -582,16 +586,16 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, Tuple<List<V>, List<W>>>> GroupWith<K, V, W>(
this RDD<KeyValuePair<K, V>> self,
RDD<KeyValuePair<K, W>> other,
public static RDD<Tuple<K, Tuple<List<V>, List<W>>>> GroupWith<K, V, W>(
this RDD<Tuple<K, V>> self,
RDD<Tuple<K, W>> other,
int numPartitions = 0)
{
// MapValues, which introduces extra CSharpRDD, is not necessary when union different RDD types
if (typeof(V) != typeof(W))
{
return self.ConvertTo<KeyValuePair<K, dynamic>>()
.Union(other.ConvertTo<KeyValuePair<K, dynamic>>())
return self.ConvertTo<Tuple<K, dynamic>>()
.Union(other.ConvertTo<Tuple<K, dynamic>>())
.MapPartitionsWithIndex<K, V, W, W, W>()
.CombineByKey(
() => new Tuple<List<V>, List<W>>(new List<V>(), new List<W>()),
@ -610,9 +614,9 @@ namespace Microsoft.Spark.CSharp.Core
}
/// <summary>
/// var x = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 5), new KeyValuePair&lt;string, int>("b", 6) }, 2);
/// var y = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 2);
/// var z = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
/// var x = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 5), new Tuple&lt;string, int>("b", 6) }, 2);
/// var y = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 2);
/// var z = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 2) }, 1);
/// x.GroupWith(y, z).Collect();
/// </summary>
/// <typeparam name="K"></typeparam>
@ -624,18 +628,18 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="other2"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, Tuple<List<V>, List<W1>, List<W2>>>> GroupWith<K, V, W1, W2>(
this RDD<KeyValuePair<K, V>> self,
RDD<KeyValuePair<K, W1>> other1,
RDD<KeyValuePair<K, W2>> other2,
public static RDD<Tuple<K, Tuple<List<V>, List<W1>, List<W2>>>> GroupWith<K, V, W1, W2>(
this RDD<Tuple<K, V>> self,
RDD<Tuple<K, W1>> other1,
RDD<Tuple<K, W2>> other2,
int numPartitions = 0)
{
// MapValues, which introduces extra CSharpRDD, is not necessary when union different RDD types
if (!(typeof(V) == typeof(W1) && typeof(V) == typeof(W2)))
{
return self.ConvertTo<KeyValuePair<K, dynamic>>()
.Union(other1.ConvertTo<KeyValuePair<K, dynamic>>())
.Union(other2.ConvertTo<KeyValuePair<K, dynamic>>())
return self.ConvertTo<Tuple<K, dynamic>>()
.Union(other1.ConvertTo<Tuple<K, dynamic>>())
.Union(other2.ConvertTo<Tuple<K, dynamic>>())
.MapPartitionsWithIndex<K, V, W1, W2, W2>()
.CombineByKey(
() => new Tuple<List<V>, List<W1>, List<W2>>(new List<V>(), new List<W1>(), new List<W2>()),
@ -655,10 +659,10 @@ namespace Microsoft.Spark.CSharp.Core
}
/// <summary>
/// var x = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 5), new KeyValuePair&lt;string, int>("b", 6) }, 2);
/// var y = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 2);
/// var z = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
/// var w = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("b", 42) }, 1);
/// var x = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 5), new Tuple&lt;string, int>("b", 6) }, 2);
/// var y = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 2);
/// var z = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 2) }, 1);
/// var w = sc.Parallelize(new[] { new Tuple&lt;string, int>("b", 42) }, 1);
/// var m = x.GroupWith(y, z, w).MapValues(l => string.Join(" ", l.Item1) + " : " + string.Join(" ", l.Item2) + " : " + string.Join(" ", l.Item3) + " : " + string.Join(" ", l.Item4)).Collect();
/// </summary>
/// <typeparam name="K"></typeparam>
@ -672,20 +676,20 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="other3"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, Tuple<List<V>, List<W1>, List<W2>, List<W3>>>> GroupWith<K, V, W1, W2, W3>(
this RDD<KeyValuePair<K, V>> self,
RDD<KeyValuePair<K, W1>> other1,
RDD<KeyValuePair<K, W2>> other2,
RDD<KeyValuePair<K, W3>> other3,
public static RDD<Tuple<K, Tuple<List<V>, List<W1>, List<W2>, List<W3>>>> GroupWith<K, V, W1, W2, W3>(
this RDD<Tuple<K, V>> self,
RDD<Tuple<K, W1>> other1,
RDD<Tuple<K, W2>> other2,
RDD<Tuple<K, W3>> other3,
int numPartitions = 0)
{
// MapValues, which introduces extra CSharpRDD, is not necessary when union different RDD types
if (!(typeof(V) == typeof(W1) && typeof(V) == typeof(W2)))
{
return self.ConvertTo<KeyValuePair<K, dynamic>>()
.Union(other1.ConvertTo<KeyValuePair<K, dynamic>>())
.Union(other2.ConvertTo<KeyValuePair<K, dynamic>>())
.Union(other3.ConvertTo<KeyValuePair<K, dynamic>>())
return self.ConvertTo<Tuple<K, dynamic>>()
.Union(other1.ConvertTo<Tuple<K, dynamic>>())
.Union(other2.ConvertTo<Tuple<K, dynamic>>())
.Union(other3.ConvertTo<Tuple<K, dynamic>>())
.MapPartitionsWithIndex<K, V, W1, W2, W3>()
.CombineByKey(
() => new Tuple<List<V>, List<W1>, List<W2>, List<W3>>(new List<V>(), new List<W1>(), new List<W2>(), new List<W3>()),
@ -713,7 +717,7 @@ namespace Microsoft.Spark.CSharp.Core
// ///
// /// var fractions = new <see cref="Dictionary{string, double}"/> { { "a", 0.2 }, { "b", 0.1 } };
// /// var rdd = sc.Parallelize(fractions.Keys.ToArray(), 2).Cartesian(sc.Parallelize(Enumerable.Range(0, 1000), 2));
// /// var sample = rdd.Map(t => new KeyValuePair&lt;string, int>(t.Item1, t.Item2)).SampleByKey(false, fractions, 2).GroupByKey().Collect();
// /// var sample = rdd.Map(t => new Tuple&lt;string, int>(t.Item1, t.Item2)).SampleByKey(false, fractions, 2).GroupByKey().Collect();
// ///
// /// 100 &lt; sample["a"].Length &lt; 300 and 50 &lt; sample["b"].Length &lt; 150
// /// true
@ -730,8 +734,8 @@ namespace Microsoft.Spark.CSharp.Core
// /// <param name="fractions"></param>
// /// <param name="seed"></param>
// /// <returns></returns>
//public static RDD<KeyValuePair<string, V>> SampleByKey<V>(
// this RDD<KeyValuePair<string, V>> self,
//public static RDD<Tuple<string, V>> SampleByKey<V>(
// this RDD<Tuple<string, V>> self,
// bool withReplacement,
// Dictionary<string, double> fractions,
// long seed)
@ -739,14 +743,14 @@ namespace Microsoft.Spark.CSharp.Core
// if (fractions.Any(f => f.Value < 0.0))
// throw new ArgumentException(string.Format("Negative fraction value found in: {0}", string.Join(",", fractions.Values.ToArray())));
// return new RDD<KeyValuePair<string, V>>(self.RddProxy.SampleByKey(withReplacement, fractions, seed), self.sparkContext);
// return new RDD<Tuple<string, V>>(self.RddProxy.SampleByKey(withReplacement, fractions, seed), self.sparkContext);
//}
/// <summary>
/// Return each (key, value) pair in this RDD that has no pair with matching key in <paramref name="other"/>.
///
/// var x = sc.Parallelize(new[] { new KeyValuePair&lt;string, int?>("a", 1), new KeyValuePair&lt;string, int?>("b", 4), new KeyValuePair&lt;string, int?>("b", 5), new KeyValuePair&lt;string, int?>("a", 2) }, 2);
/// var y = sc.Parallelize(new[] { new KeyValuePair&lt;string, int?>("a", 3), new KeyValuePair&lt;string, int?>("c", null) }, 2);
/// var x = sc.Parallelize(new[] { new Tuple&lt;string, int?>("a", 1), new Tuple&lt;string, int?>("b", 4), new Tuple&lt;string, int?>("b", 5), new Tuple&lt;string, int?>("a", 2) }, 2);
/// var y = sc.Parallelize(new[] { new Tuple&lt;string, int?>("a", 3), new Tuple&lt;string, int?>("c", null) }, 2);
/// x.SubtractByKey(y).Collect();
///
/// [('b', 4), ('b', 5)]
@ -759,7 +763,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, V>> SubtractByKey<K, V, W>(this RDD<KeyValuePair<K, V>> self, RDD<KeyValuePair<K, W>> other, int numPartitions = 0)
public static RDD<Tuple<K, V>> SubtractByKey<K, V, W>(this RDD<Tuple<K, V>> self, RDD<Tuple<K, W>> other, int numPartitions = 0)
{
return self.GroupWith(other, numPartitions).FlatMapValues(t => t.Item1.Where(v => t.Item2.Count == 0));
}
@ -770,7 +774,7 @@ namespace Microsoft.Spark.CSharp.Core
/// searching the partition that the key maps to.
///
/// >>> l = range(1000)
/// >>> rdd = sc.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new KeyValuePair&lt;int, int>(x, y)), 10)
/// >>> rdd = sc.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new Tuple&lt;int, int>(x, y)), 10)
/// >>> rdd.lookup(42)
/// [42]
///
@ -780,7 +784,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="self"></param>
/// <param name="key"></param>
/// <returns></returns>
public static V[] Lookup<K, V>(this RDD<KeyValuePair<K, V>> self, K key)
public static V[] Lookup<K, V>(this RDD<Tuple<K, V>> self, K key)
{
return self.Filter(new LookupHelper<K, V>(key).Execute).Values().Collect();
}
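
A usage sketch of Lookup with the Tuple-based RDD, following the example in the summary above (`sc` is assumed to exist; requires System.Linq):

```c#
var rdd = sc.Parallelize(
    Enumerable.Range(0, 1000).Select(i => new Tuple<int, int>(i, i * i)), 10);
int[] hits = rdd.Lookup(42);   // { 1764 }
```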
@ -795,7 +799,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <typeparam name="V"></typeparam>
/// <param name="self"></param>
/// <param name="conf">Hadoop job configuration, passed in as a dict</param>
public static void SaveAsNewAPIHadoopDataset<K, V>(this RDD<KeyValuePair<K, V>> self, IEnumerable<KeyValuePair<string, string>> conf)
public static void SaveAsNewAPIHadoopDataset<K, V>(this RDD<Tuple<K, V>> self, IEnumerable<Tuple<string, string>> conf)
{
self.RddProxy.SaveAsNewAPIHadoopDataset(conf);
}
@ -811,7 +815,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="keyClass">fully qualified classname of key Writable class (e.g. "org.apache.hadoop.io.IntWritable", None by default)</param>
/// <param name="valueClass">fully qualified classname of value Writable class (e.g. "org.apache.hadoop.io.Text", None by default)</param>
/// <param name="conf">Hadoop job configuration, passed in as a dict (None by default)</param>
public static void SaveAsNewAPIHadoopFile<K, V>(this RDD<KeyValuePair<K, V>> self, string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf)
public static void SaveAsNewAPIHadoopFile<K, V>(this RDD<Tuple<K, V>> self, string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf)
{
self.RddProxy.SaveAsNewAPIHadoopFile(path, outputFormatClass, keyClass, valueClass, conf);
}
@ -826,7 +830,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <typeparam name="V"></typeparam>
/// <param name="self"></param>
/// <param name="conf">Hadoop job configuration, passed in as a dict</param>
public static void SaveAsHadoopDataset<K, V>(this RDD<KeyValuePair<K, V>> self, IEnumerable<KeyValuePair<string, string>> conf)
public static void SaveAsHadoopDataset<K, V>(this RDD<Tuple<K, V>> self, IEnumerable<Tuple<string, string>> conf)
{
self.RddProxy.SaveAsHadoopDataset(conf);
}
@ -848,7 +852,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="valueClass">fully qualified classname of value Writable class (e.g. "org.apache.hadoop.io.Text", None by default)</param>
/// <param name="conf">(None by default)</param>
/// <param name="compressionCodecClass">(None by default)</param>
public static void SaveAsHadoopFile<K, V>(this RDD<KeyValuePair<K, V>> self, string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf, string compressionCodecClass)
public static void SaveAsHadoopFile<K, V>(this RDD<Tuple<K, V>> self, string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf, string compressionCodecClass)
{
self.RddProxy.SaveAsHadoopFile(path, outputFormatClass, keyClass, valueClass, conf, compressionCodecClass);
}
@ -867,7 +871,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="self"></param>
/// <param name="path">path to sequence file</param>
/// <param name="compressionCodecClass">(None by default)</param>
public static void SaveAsSequenceFile<K, V>(this RDD<KeyValuePair<K, V>> self, string path, string compressionCodecClass)
public static void SaveAsSequenceFile<K, V>(this RDD<Tuple<K, V>> self, string path, string compressionCodecClass)
{
self.RddProxy.SaveAsSequenceFile(path, compressionCodecClass);
}
@ -887,12 +891,12 @@ namespace Microsoft.Spark.CSharp.Core
mergeCombiners = mc;
}
public IEnumerable<KeyValuePair<K, C>> Execute(int pid, IEnumerable<KeyValuePair<K, C>> input)
public IEnumerable<Tuple<K, C>> Execute(int pid, IEnumerable<Tuple<K, C>> input)
{
return input.GroupBy(
kvp => kvp.Key,
kvp => kvp.Value,
(k, v) => new KeyValuePair<K, C>(k, v.Aggregate(mergeCombiners))
kvp => kvp.Item1,
kvp => kvp.Item2,
(k, v) => new Tuple<K, C>(k, v.Aggregate(mergeCombiners))
);
}
}
@ -908,12 +912,12 @@ namespace Microsoft.Spark.CSharp.Core
this.mergeValue = mergeValue;
}
public IEnumerable<KeyValuePair<K, C>> Execute(int pid, IEnumerable<KeyValuePair<K, V>> input)
public IEnumerable<Tuple<K, C>> Execute(int pid, IEnumerable<Tuple<K, V>> input)
{
return input.GroupBy(
kvp => kvp.Key,
kvp => kvp.Value,
(k, v) => new KeyValuePair<K, C>(k, v.Aggregate(createCombiner(), mergeValue))
kvp => kvp.Item1,
kvp => kvp.Item2,
(k, v) => new Tuple<K, C>(k, v.Aggregate(createCombiner(), mergeValue))
);
}
}
@ -932,7 +936,7 @@ namespace Microsoft.Spark.CSharp.Core
this.partitionFunc = partitionFunc;
}
public IEnumerable<byte[]> Execute(int split, IEnumerable<KeyValuePair<K, V>> input)
public IEnumerable<byte[]> Execute(int split, IEnumerable<Tuple<K, V>> input)
{
// make sure that md5 is not null even if it is deserialized in C# worker
if (md5 == null)
@ -945,12 +949,12 @@ namespace Microsoft.Spark.CSharp.Core
var ms = new MemoryStream();
if (partitionFunc == null)
{
formatter.Serialize(ms, kv.Key);
formatter.Serialize(ms, kv.Item1);
yield return md5.ComputeHash(ms.ToArray()).Take(8).ToArray();
}
else
{
long pid = (long)(partitionFunc(kv.Key) % numPartitions);
long pid = (long)(partitionFunc(kv.Item1) % numPartitions);
yield return SerDe.ToBytes(pid);
}
ms = new MemoryStream();
@ -969,12 +973,12 @@ namespace Microsoft.Spark.CSharp.Core
func = f;
}
public KeyValuePair<K, U> Execute(KeyValuePair<K, V> kvp)
public Tuple<K, U> Execute(Tuple<K, V> kvp)
{
return new KeyValuePair<K, U>
return new Tuple<K, U>
(
kvp.Key,
func(kvp.Value)
kvp.Item1,
func(kvp.Item2)
);
}
}
@ -988,9 +992,9 @@ namespace Microsoft.Spark.CSharp.Core
func = f;
}
public IEnumerable<KeyValuePair<K, U>> Execute(KeyValuePair<K, V> kvp)
public IEnumerable<Tuple<K, U>> Execute(Tuple<K, V> kvp)
{
return func(kvp.Value).Select(v => new KeyValuePair<K, U>(kvp.Key, v));
return func(kvp.Item2).Select(v => new Tuple<K, U>(kvp.Item1, v));
}
}
[Serializable]
@ -1001,9 +1005,9 @@ namespace Microsoft.Spark.CSharp.Core
{
this.key = key;
}
internal bool Execute(KeyValuePair<K, V> input)
internal bool Execute(Tuple<K, V> input)
{
return input.Key.ToString() == key.ToString();
return input.Item1.ToString() == key.ToString();
}
}

View file

@ -51,6 +51,17 @@ namespace Microsoft.Spark.CSharp.Core
}
}
/// <summary>
/// Return the SparkContext that created this RDD
/// </summary>
public SparkContext SparkContext
{
get
{
return sparkContext;
}
}
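
A small sketch of why the new SparkContext property is useful: code that only receives an RDD can still reach the context that created it. Here `lines` is assumed to be an existing RDD&lt;string>:

```c#
SparkContext ctx = lines.SparkContext;
// e.g. build a second RDD from the same context without passing it around separately
var extras = ctx.Parallelize(new[] { "error", "warn" }, 1);
```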
/// <summary>
/// Return whether this RDD has been cached or not
/// </summary>
@ -189,7 +200,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <summary>
/// Return a new RDD by applying a function to each element of this RDD.
///
/// sc.Parallelize(new string[]{"b", "a", "c"}, 1).Map(x => new KeyValuePair&lt;string, int>(x, 1)).Collect()
/// sc.Parallelize(new string[]{"b", "a", "c"}, 1).Map(x => new Tuple&lt;string, int>(x, 1)).Collect()
/// [('a', 1), ('b', 1), ('c', 1)]
///
/// </summary>
@ -288,7 +299,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <returns></returns>
public RDD<T> Distinct(int numPartitions = 0)
{
return Map(x => new KeyValuePair<T, int>(x, 0)).ReduceByKey((x, y) => x, numPartitions).Map<T>(x => x.Key);
return Map(x => new Tuple<T, int>(x, 0)).ReduceByKey((x, y) => x, numPartitions).Map<T>(x => x.Item1);
}
/// <summary>
@ -461,9 +472,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <returns></returns>
public RDD<T> Intersection(RDD<T> other)
{
return Map(v => new KeyValuePair<T, int>(v, 0))
.GroupWith(other.Map(v => new KeyValuePair<T, int>(v, 0)))
.Filter(kv => kv.Value.Item1.Count > 0 && kv.Value.Item2.Count > 0)
return Map(v => new Tuple<T, int>(v, 0))
.GroupWith(other.Map(v => new Tuple<T, int>(v, 0)))
.Filter(kv => kv.Item2.Item1.Count > 0 && kv.Item2.Item2.Count > 0)
.Keys();
}
@ -533,7 +544,7 @@ namespace Microsoft.Spark.CSharp.Core
///
/// </summary>
/// <returns></returns>
public RDD<KeyValuePair<K, List<T>>> GroupBy<K>(Func<T, K> f, int numPartitions = 0)
public RDD<Tuple<K, List<T>>> GroupBy<K>(Func<T, K> f, int numPartitions = 0)
{
return KeyBy(f).GroupByKey(numPartitions);
}
@ -639,14 +650,14 @@ namespace Microsoft.Spark.CSharp.Core
if (depth < 1)
throw new ArgumentException(string.Format("Depth cannot be smaller than 1 but got {0}.", depth));
var zeroValue = new KeyValuePair<T, bool>(default(T), true); // Use the second entry to indicate whether this is a dummy value.
var zeroValue = new Tuple<T, bool>(default(T), true); // Use the second entry to indicate whether this is a dummy value.
Func<KeyValuePair<T, bool>, KeyValuePair<T, bool>, KeyValuePair<T, bool>> op = new TreeReduceHelper<T>(f).Execute;
Func<Tuple<T, bool>, Tuple<T, bool>, Tuple<T, bool>> op = new TreeReduceHelper<T>(f).Execute;
var reduced = Map<KeyValuePair<T, bool>>(x => new KeyValuePair<T, bool>(x, false)).TreeAggregate(zeroValue, op, op, depth);
if (reduced.Value)
var reduced = Map<Tuple<T, bool>>(x => new Tuple<T, bool>(x, false)).TreeAggregate(zeroValue, op, op, depth);
if (reduced.Item2)
throw new ArgumentException("Cannot reduce empty RDD.");
return reduced.Key;
return reduced.Item1;
}
/// <summary>
@ -736,7 +747,7 @@ namespace Microsoft.Spark.CSharp.Core
numPartitions /= scale;
partiallyAggregated = partiallyAggregated
.MapPartitionsWithIndex<KeyValuePair<int, U>>(new TreeAggregateHelper<U>(numPartitions).Execute)
.MapPartitionsWithIndex<Tuple<int, U>>(new TreeAggregateHelper<U>(numPartitions).Execute)
.ReduceByKey(combOp, numPartitions)
.Values();
}
@ -762,9 +773,9 @@ namespace Microsoft.Spark.CSharp.Core
///
/// </summary>
/// <returns></returns>
public Dictionary<T, long> CountByValue()
public IEnumerable<Tuple<T, long>> CountByValue()
{
return Map<KeyValuePair<T, T>>(v => new KeyValuePair<T, T>(v, default(T))).CountByKey();
return Map(v => new Tuple<T, T>(v, default(T))).CountByKey();
}
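
CountByValue now returns a sequence of Tuple&lt;T, long> instead of a Dictionary. A usage sketch, assuming an existing SparkContext `sc`:

```c#
var histogram = sc.Parallelize(new[] { 1, 2, 1, 2, 2 }, 2).CountByValue();
// IEnumerable<Tuple<int, long>>: (1, 2), (2, 3)
```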
/// <summary>
@ -872,9 +883,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <returns></returns>
public RDD<T> Subtract(RDD<T> other, int numPartitions = 0)
{
return Map<KeyValuePair<T, T>>(v => new KeyValuePair<T, T>(v, default(T))).SubtractByKey
return Map<Tuple<T, T>>(v => new Tuple<T, T>(v, default(T))).SubtractByKey
(
other.Map<KeyValuePair<T, T>>(v => new KeyValuePair<T, T>(v, default(T))),
other.Map<Tuple<T, T>>(v => new Tuple<T, T>(v, default(T))),
numPartitions
)
.Keys();
@ -890,9 +901,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <typeparam name="K"></typeparam>
/// <param name="f"></param>
/// <returns></returns>
public RDD<KeyValuePair<K, T>> KeyBy<K>(Func<T, K> f)
public RDD<Tuple<K, T>> KeyBy<K>(Func<T, K> f)
{
return Map<KeyValuePair<K, T>>(new KeyByHelper<K, T>(f).Execute);
return Map<Tuple<K, T>>(new KeyByHelper<K, T>(f).Execute);
}
/// <summary>
@ -950,9 +961,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <typeparam name="U"></typeparam>
/// <param name="other"></param>
/// <returns></returns>
public RDD<KeyValuePair<T, U>> Zip<U>(RDD<U> other)
public RDD<Tuple<T, U>> Zip<U>(RDD<U> other)
{
return new RDD<KeyValuePair<T, U>>(RddProxy.Zip(other.RddProxy), sparkContext, SerializedMode.Pair);
return new RDD<Tuple<T, U>>(RddProxy.Zip(other.RddProxy), sparkContext, SerializedMode.Pair);
}
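
A usage sketch of the Tuple-based Zip; both RDDs need the same number of partitions and the same number of elements per partition, and `sc` is assumed to exist:

```c#
var xs = sc.Parallelize(new[] { 1, 2, 3 }, 1);
var ys = sc.Parallelize(new[] { "a", "b", "c" }, 1);
var zipped = xs.Zip(ys).Collect();   // Tuple<int, string>: (1, "a"), (2, "b"), (3, "c")
```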
/// <summary>
@ -971,7 +982,7 @@ namespace Microsoft.Spark.CSharp.Core
///
/// </summary>
/// <returns></returns>
public RDD<KeyValuePair<T, long>> ZipWithIndex()
public RDD<Tuple<T, long>> ZipWithIndex()
{
int num = GetNumPartitions();
int[] starts = new int[num];
@ -981,7 +992,7 @@ namespace Microsoft.Spark.CSharp.Core
for (int i = 0; i < nums.Length - 1; i++)
starts[i + 1] = starts[i] + nums[i];
}
return MapPartitionsWithIndex<KeyValuePair<T, long>>(new ZipWithIndexHelper<T>(starts).Execute);
return MapPartitionsWithIndex<Tuple<T, long>>(new ZipWithIndexHelper<T>(starts).Execute);
}
/// <summary>
@ -996,10 +1007,10 @@ namespace Microsoft.Spark.CSharp.Core
///
/// </summary>
/// <returns></returns>
public RDD<KeyValuePair<T, long>> ZipWithUniqueId()
public RDD<Tuple<T, long>> ZipWithUniqueId()
{
int num = GetNumPartitions();
return MapPartitionsWithIndex<KeyValuePair<T, long>>(new ZipWithUniqueIdHelper<T>(num).Execute);
return MapPartitionsWithIndex<Tuple<T, long>>(new ZipWithUniqueIdHelper<T>(num).Execute);
}
/// <summary>
@ -1225,27 +1236,27 @@ namespace Microsoft.Spark.CSharp.Core
{
K key;
dynamic value;
if (x is KeyValuePair<K, V>)
if (x is Tuple<K, V>)
{
key = ((KeyValuePair<K, V>)x).Key;
value = ((KeyValuePair<K, V>)x).Value;
key = ((Tuple<K, V>)x).Item1;
value = ((Tuple<K, V>)x).Item2;
}
else if (x is KeyValuePair<K, W1>)
else if (x is Tuple<K, W1>)
{
key = ((KeyValuePair<K, W1>)x).Key;
value = ((KeyValuePair<K, W1>)x).Value;
key = ((Tuple<K, W1>)x).Item1;
value = ((Tuple<K, W1>)x).Item2;
}
else if (x is KeyValuePair<K, W2>)
else if (x is Tuple<K, W2>)
{
key = ((KeyValuePair<K, W2>)x).Key;
value = ((KeyValuePair<K, W2>)x).Value;
key = ((Tuple<K, W2>)x).Item1;
value = ((Tuple<K, W2>)x).Item2;
}
else
{
key = ((KeyValuePair<K, W3>)x).Key;
value = ((KeyValuePair<K, W3>)x).Value;
key = ((Tuple<K, W3>)x).Item1;
value = ((Tuple<K, W3>)x).Item2;
}
return new KeyValuePair<K, dynamic>(key, value);
return new Tuple<K, dynamic>(key, value);
})
.Cast<dynamic>();
}
@ -1405,9 +1416,9 @@ namespace Microsoft.Spark.CSharp.Core
func = f;
}
internal KeyValuePair<K, T> Execute(T input)
internal Tuple<K, T> Execute(T input)
{
return new KeyValuePair<K, T>(func(input), input);
return new Tuple<K, T>(func(input), input);
}
}
[Serializable]
@ -1434,9 +1445,9 @@ namespace Microsoft.Spark.CSharp.Core
{
this.numPartitions = numPartitions;
}
internal IEnumerable<KeyValuePair<int, U>> Execute(int pid, IEnumerable<U> input)
internal IEnumerable<Tuple<int, U>> Execute(int pid, IEnumerable<U> input)
{
return input.Select(x => new KeyValuePair<int, U>(pid % numPartitions, x));
return input.Select(x => new Tuple<int, U>(pid % numPartitions, x));
}
}
[Serializable]
@ -1447,14 +1458,14 @@ namespace Microsoft.Spark.CSharp.Core
{
this.func = func;
}
internal KeyValuePair<T, bool> Execute(KeyValuePair<T, bool> x, KeyValuePair<T, bool> y)
internal Tuple<T, bool> Execute(Tuple<T, bool> x, Tuple<T, bool> y)
{
if (x.Value)
if (x.Item2)
return y;
else if (y.Value)
else if (y.Item2)
return x;
else
return new KeyValuePair<T, bool>(func(x.Key, y.Key), false);
return new Tuple<T, bool>(func(x.Item1, y.Item1), false);
}
}
[Serializable]
@ -1539,12 +1550,12 @@ namespace Microsoft.Spark.CSharp.Core
{
this.num = num;
}
internal IEnumerable<KeyValuePair<T, long>> Execute(int pid, IEnumerable<T> input)
internal IEnumerable<Tuple<T, long>> Execute(int pid, IEnumerable<T> input)
{
long l = 0;
foreach (var item in input)
{
yield return new KeyValuePair<T, long>(item, (l++) * num + pid);
yield return new Tuple<T, long>(item, (l++) * num + pid);
}
}
}
@ -1556,12 +1567,12 @@ namespace Microsoft.Spark.CSharp.Core
{
this.starts = starts;
}
internal IEnumerable<KeyValuePair<T, long>> Execute(int pid, IEnumerable<T> input)
internal IEnumerable<Tuple<T, long>> Execute(int pid, IEnumerable<T> input)
{
long l = 0;
foreach (var item in input)
{
yield return new KeyValuePair<T, long>(item, (l++) + starts[pid]);
yield return new Tuple<T, long>(item, (l++) + starts[pid]);
}
}
}

View file

@ -241,7 +241,7 @@ namespace Microsoft.Spark.CSharp.Core
///
/// Do
/// {{{
/// RDD&lt;KeyValuePair&lt;string, string>> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path")
/// RDD&lt;Tuple&lt;string, string>> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path")
/// }}}
///
/// then `rdd` contains
@ -259,9 +259,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="filePath"></param>
/// <param name="minPartitions"></param>
/// <returns></returns>
public RDD<KeyValuePair<byte[], byte[]>> WholeTextFiles(string filePath, int? minPartitions = null)
public RDD<Tuple<byte[], byte[]>> WholeTextFiles(string filePath, int? minPartitions = null)
{
return new RDD<KeyValuePair<byte[], byte[]>>(SparkContextProxy.WholeTextFiles(filePath, minPartitions ?? DefaultMinPartitions), this, SerializedMode.Pair);
return new RDD<Tuple<byte[], byte[]>>(SparkContextProxy.WholeTextFiles(filePath, minPartitions ?? DefaultMinPartitions), this, SerializedMode.Pair);
}
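
A usage sketch of the Tuple-based WholeTextFiles; the path is a placeholder and the snippet requires System.Text:

```c#
RDD<Tuple<byte[], byte[]>> files = sc.WholeTextFiles(@"hdfs://a-hdfs-path");
// Item1 holds the file path and Item2 the file content, both as raw bytes
var names = files.Map(t => Encoding.UTF8.GetString(t.Item1)).Collect();
```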
/// <summary>
@ -279,7 +279,7 @@ namespace Microsoft.Spark.CSharp.Core
/// }}}
///
/// Do
/// RDD&lt;KeyValuePair&lt;string, byte[]>>"/> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
/// RDD&lt;Tuple&lt;string, byte[]>>"/> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
///
/// then `rdd` contains
/// {{{
@ -296,9 +296,9 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="filePath"></param>
/// <param name="minPartitions"></param>
/// <returns></returns>
public RDD<KeyValuePair<byte[], byte[]>> BinaryFiles(string filePath, int? minPartitions)
public RDD<Tuple<byte[], byte[]>> BinaryFiles(string filePath, int? minPartitions)
{
return new RDD<KeyValuePair<byte[], byte[]>>(SparkContextProxy.BinaryFiles(filePath, minPartitions ?? DefaultMinPartitions), this, SerializedMode.Pair);
return new RDD<Tuple<byte[], byte[]>>(SparkContextProxy.BinaryFiles(filePath, minPartitions ?? DefaultMinPartitions), this, SerializedMode.Pair);
}
/// <summary>
@ -341,7 +341,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="valueConverterClass">(None by default)</param>
/// <param name="conf"> Hadoop configuration, passed in as a dict (None by default)</param>
/// <returns></returns>
public RDD<byte[]> NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<KeyValuePair<string, string>> conf = null)
public RDD<byte[]> NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<Tuple<string, string>> conf = null)
{
return new RDD<byte[]>(SparkContextProxy.NewAPIHadoopFile(filePath, inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, conf, 1), this, SerializedMode.None);
}
@ -360,7 +360,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="valueConverterClass">(None by default)</param>
/// <param name="conf">Hadoop configuration, passed in as a dict (None by default)</param>
/// <returns></returns>
public RDD<byte[]> NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<KeyValuePair<string, string>> conf = null)
public RDD<byte[]> NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<Tuple<string, string>> conf = null)
{
return new RDD<byte[]>(SparkContextProxy.NewAPIHadoopRDD(inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, conf, 1), this, SerializedMode.None);
}
@ -381,7 +381,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="valueConverterClass">(None by default)</param>
/// <param name="conf">Hadoop configuration, passed in as a dict (None by default)</param>
/// <returns></returns>
public RDD<byte[]> HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<KeyValuePair<string, string>> conf = null)
public RDD<byte[]> HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<Tuple<string, string>> conf = null)
{
return new RDD<byte[]>(SparkContextProxy.HadoopFile(filePath, inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, conf, 1), this, SerializedMode.None);
}
@ -400,7 +400,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="valueConverterClass">(None by default)</param>
/// <param name="conf">Hadoop configuration, passed in as a dict (None by default)</param>
/// <returns></returns>
public RDD<byte[]> HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<KeyValuePair<string, string>> conf = null)
public RDD<byte[]> HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<Tuple<string, string>> conf = null)
{
return new RDD<byte[]>(SparkContextProxy.HadoopRDD(inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, conf, 1), this, SerializedMode.None);
}
@ -571,6 +571,17 @@ namespace Microsoft.Spark.CSharp.Core
SparkContextProxy.SetLogLevel(logLevel);
}
/// <summary>
/// Run a job on a given set of partitions of an RDD.
/// </summary>
/// <typeparam name="T"></typeparam>
/// <param name="rdd"></param>
/// <param name="partitions"></param>
public void RunJob<T>(RDD<T> rdd, IEnumerable<int> partitions)
{
SparkContextProxy.RunJob(rdd.RddProxy, partitions);
}
/// <summary>
/// Cancel active jobs for the specified group. See <see cref="SetJobGroup"/> for more information.
/// </summary>
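
The SparkContext surface switches its pair-returning and Hadoop-configuration APIs to Tuple and gains a RunJob overload for computing selected partitions. A minimal sketch of the updated calls (paths, Hadoop class names and conf keys below are placeholders, not taken from this commit):

```c#
using System;
using System.Collections.Generic;
using Microsoft.Spark.CSharp.Core;

var sc = new SparkContext(new SparkConf().SetAppName("SparkContextTupleExample"));

// File name/content pairs are now Tuple<byte[], byte[]> instead of KeyValuePair.
RDD<Tuple<byte[], byte[]>> files = sc.WholeTextFiles("hdfs://path/to/input-dir");

// Hadoop configuration entries are passed as a sequence of tuples.
var hadoopConf = new List<Tuple<string, string>>
{
    Tuple.Create("mapreduce.input.fileinputformat.inputdir", "hdfs://path/to/input-dir")
};
RDD<byte[]> records = sc.NewAPIHadoopRDD(
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf: hadoopConf);

// The new RunJob overload triggers computation of the given partitions only.
sc.RunJob(files, new[] { 0 });
```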

View file

@ -7,6 +7,7 @@ using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.Spark.CSharp.Proxy.Ipc;
using System;
namespace Microsoft.Spark.CSharp.Interop.Ipc
{
@ -16,31 +17,31 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
[ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not required
internal static class JvmBridgeUtils
{
public static JvmObjectReference GetJavaMap<K, V>(IEnumerable<KeyValuePair<K, V>> enumerable)
public static JvmObjectReference GetJavaMap<K, V>(IEnumerable<Tuple<K, V>> enumerable)
{
var jmap = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.Hashtable", new object[] { });
if (enumerable != null)
{
foreach (var item in enumerable)
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Key, item.Value });
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Item1, item.Item2 });
}
return jmap;
}
public static JvmObjectReference GetJavaHashMap<K, V>(IEnumerable<KeyValuePair<K, V>> enumerable)
public static JvmObjectReference GetJavaHashMap<K, V>(IEnumerable<Tuple<K, V>> enumerable)
{
var jmap = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.HashMap", new object[] { });
if (enumerable != null)
{
foreach (var item in enumerable)
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Key, item.Value });
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Item1, item.Item2 });
}
return jmap;
}
public static JvmObjectReference GetScalaMutableMap<K, V>(Dictionary<K, V> mapValues)
public static JvmObjectReference GetScalaMutableMap<K, V>(IEnumerable<Tuple<K, V>> mapValues)
{
var hashMapReference = GetJavaHashMap(mapValues.Select(kvp => kvp));
var hashMapReference = GetJavaHashMap(mapValues);
return new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.JvmBridgeUtils", "toMutableMap", new object[] { hashMapReference }).ToString());
}
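
Callers that previously handed these helpers a Dictionary now need an IEnumerable<Tuple<K, V>>; a one-line LINQ projection bridges existing dictionary-based code. Sketch only, with placeholder settings:

```c#
using System;
using System.Collections.Generic;
using System.Linq;

var settings = new Dictionary<string, string>
{
    { "spark.executor.memory", "2g" },
    { "spark.cores.max", "4" }
};

// Project the dictionary into the tuple sequence the updated helpers expect.
IEnumerable<Tuple<string, string>> asTuples =
    settings.Select(kvp => Tuple.Create(kvp.Key, kvp.Value));
```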

View file

@ -69,8 +69,8 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
int previousReferencesCountBenchmark = referencesCountBenchmark;
checkCount *= 2;
referencesCountBenchmark = referencesCountBenchmark + referencesCountBenchmark / 2;
logger.LogDebug("Adjust checkCount from {0} to {1}, referencesCountBenchmark from {2} to {3}",
previousCheckCount, checkCount, previousReferencesCountBenchmark, referencesCountBenchmark);
//logger.LogDebug("Adjust checkCount from {0} to {1}, referencesCountBenchmark from {2} to {3}",
// previousCheckCount, checkCount, previousReferencesCountBenchmark, referencesCountBenchmark);
}
return checkCount;
}
@ -134,14 +134,14 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
private void RunReleaseObjectLoop()
{
logger.LogDebug("Checking objects thread start ...");
//logger.LogDebug("Checking objects thread start ...");
while (shouldKeepRunning)
{
ReleseGarbageCollectedObjects();
Thread.Sleep(CheckInterval);
}
logger.LogDebug("Checking objects thread stopped.");
//logger.LogDebug("Checking objects thread stopped.");
}
~WeakObjectManagerImpl()
@ -165,13 +165,13 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
int referencesCount = weakReferences.Count;
if (referencesCount == 0)
{
logger.LogDebug("check begin : quit as weakReferences.Count = 0");
//logger.LogDebug("check begin : quit as weakReferences.Count = 0");
return;
}
var beginTime = DateTime.Now;
int checkCount = checkCountController.AdjustCheckCount(referencesCount);
logger.LogDebug("check begin : weakReferences.Count = {0}, checkCount: {1}", referencesCount, checkCount);
//logger.LogDebug("check begin : weakReferences.Count = {0}, checkCount: {1}", referencesCount, checkCount);
int garbageCount;
var aliveList = ReleseGarbageCollectedObjects(checkCount, out garbageCount);
@ -179,11 +179,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
aliveList.ForEach(item => weakReferences.Enqueue(item));
var timeStoreAlive = DateTime.Now;
logger.LogDebug("check end : released {0} garbage, remain {1} alive, used {2} ms : release garbage used {3} ms, store alive used {4} ms",
garbageCount, weakReferences.Count, (DateTime.Now - beginTime).TotalMilliseconds,
(timeReleaseGarbage - beginTime).TotalMilliseconds,
(timeStoreAlive - timeReleaseGarbage).TotalMilliseconds
);
//logger.LogDebug("check end : released {0} garbage, remain {1} alive, used {2} ms : release garbage used {3} ms, store alive used {4} ms",
// garbageCount, weakReferences.Count, (DateTime.Now - beginTime).TotalMilliseconds,
// (timeReleaseGarbage - beginTime).TotalMilliseconds,
// (timeStoreAlive - timeReleaseGarbage).TotalMilliseconds
// );
}
private List<WeakReferenceObjectIdPair> ReleseGarbageCollectedObjects(int checkCount, out int garbageCount)
@ -208,7 +208,7 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
i++;
if (i >= checkCount)
{
logger.LogDebug("Stop releasing as exceeded allowed checkCount: {0}", checkCount);
//logger.LogDebug("Stop releasing as exceeded allowed checkCount: {0}", checkCount);
break;
}
}
@ -238,7 +238,7 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
public virtual void Dispose()
{
logger.LogInfo("Dispose {0}", this.GetType());
//logger.LogInfo("Dispose {0}", this.GetType());
shouldKeepRunning = false;
}
}

View file

@ -31,13 +31,13 @@ namespace Microsoft.Spark.CSharp.Proxy
string Name { get; }
void SetName(string name);
IRDDProxy RandomSampleWithRange(double lb, double ub, long seed);
IRDDProxy SampleByKey(bool withReplacement, Dictionary<string, double> fractions, long seed);
IRDDProxy SampleByKey(bool withReplacement, IEnumerable<Tuple<string, double>> fractions, long seed);
IRDDProxy Zip(IRDDProxy other);
string ToDebugString();
void SaveAsNewAPIHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf);
void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf);
void SaveAsHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf);
void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf, string compressionCodecClass);
void SaveAsNewAPIHadoopDataset(IEnumerable<Tuple<string, string>> conf);
void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf);
void SaveAsHadoopDataset(IEnumerable<Tuple<string, string>> conf);
void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf, string compressionCodecClass);
void SaveAsSequenceFile(string path, string compressionCodecClass);
void SaveAsTextFile(string path, string compressionCodecClass);
long Count();

View file

@ -35,10 +35,10 @@ namespace Microsoft.Spark.CSharp.Proxy
IRDDProxy WholeTextFiles(string filePath, int minPartitions);
IRDDProxy BinaryFiles(string filePath, int minPartitions);
IRDDProxy SequenceFile(string filePath, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, int minSplits, int batchSize);
IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize);
IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize);
IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize);
IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize);
IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize);
IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize);
IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize);
IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize);
IRDDProxy CheckpointFile(string filePath);
IRDDProxy Union(IEnumerable<IRDDProxy> rdds);
void AddFile(string path);

View file

@ -10,12 +10,15 @@ using Microsoft.Spark.CSharp.Sql;
namespace Microsoft.Spark.CSharp.Proxy
{
internal interface IUdfRegistration { }
internal interface IUdfRegistrationProxy
{
void RegisterFunction(string name, byte[] command, string returnType);
}
interface ISparkSessionProxy
{
ISqlContextProxy SqlContextProxy { get; }
IUdfRegistration Udf { get; }
IUdfRegistrationProxy Udf { get; }
ICatalogProxy GetCatalog();
IDataFrameReaderProxy Read();
ISparkSessionProxy NewSession();

View file

@ -3,9 +3,6 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.Spark.CSharp.Core;
@ -20,10 +17,9 @@ namespace Microsoft.Spark.CSharp.Proxy
void Checkpoint(string directory);
IDStreamProxy TextFileStream(string directory);
IDStreamProxy SocketTextStream(string hostname, int port, StorageLevelType storageLevelType);
IDStreamProxy KafkaStream(Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, StorageLevelType storageLevelType);
IDStreamProxy DirectKafkaStream(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets);
IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets,
int numPartitions, byte[] readFunc, string serializationMode);
IDStreamProxy KafkaStream(IEnumerable<Tuple<string, int>> topics, IEnumerable<Tuple<string, string>> kafkaParams, StorageLevelType storageLevelType);
IDStreamProxy DirectKafkaStream(List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets);
IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets, int numPartitions, byte[] readFunc, string serializationMode);
IDStreamProxy Union(IDStreamProxy firstDStreams, IDStreamProxy[] otherDStreams);
void AwaitTermination();
void AwaitTerminationOrTimeout(long timeout);
@ -33,7 +29,6 @@ namespace Microsoft.Spark.CSharp.Proxy
IDStreamProxy CreateCSharpStateDStream(IDStreamProxy jdstream, byte[] func, string className, string serializationMode, string serializationMode2);
IDStreamProxy CreateConstantInputDStream(IRDDProxy rddProxy);
IDStreamProxy CreateCSharpInputDStream(byte[] func, string serializationMode);
IDStreamProxy EventHubsUnionStream(Dictionary<string, string> eventHubsParams, StorageLevelType storageLevelType);
IDStreamProxy EventHubsUnionStream(IEnumerable<Tuple<string, string>> eventHubsParams, StorageLevelType storageLevelType);
}
}

View file

@ -158,7 +158,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "setName", new object[] { name });
}
public IRDDProxy SampleByKey(bool withReplacement, Dictionary<string, double> fractions, long seed)
public IRDDProxy SampleByKey(bool withReplacement, IEnumerable<Tuple<string, double>> fractions, long seed)
{
var jfractions = JvmBridgeUtils.GetJavaMap(fractions) as JvmObjectReference;
return new RDDIpcProxy(new JvmObjectReference((string) SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "sampleByKey", new object[] { withReplacement, jfractions, seed })));
@ -176,25 +176,25 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new RDDIpcProxy(new JvmObjectReference((string) SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "zip", new object[] { (other as RDDIpcProxy).jvmRddReference })));
}
public void SaveAsNewAPIHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf)
public void SaveAsNewAPIHadoopDataset(IEnumerable<Tuple<string, string>> conf)
{
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsHadoopDataset", new object[] { jvmRddReference, false, jconf, null, null, true });
}
public void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf)
public void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf)
{
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsNewAPIHadoopFile", new object[] { jvmRddReference, false, path, outputFormatClass, keyClass, valueClass, null, null, jconf });
}
public void SaveAsHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf)
public void SaveAsHadoopDataset(IEnumerable<Tuple<string, string>> conf)
{
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsHadoopDataset", new object[] { jvmRddReference, false, jconf, null, null, false });
}
public void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf, string compressionCodecClass)
public void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf, string compressionCodecClass)
{
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsHadoopFile", new object[] { jvmRddReference, false, path, outputFormatClass, keyClass, valueClass, null, null, jconf, compressionCodecClass });

View file

@ -183,7 +183,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new RDDIpcProxy(jvmRddReference);
}
public IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
public IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
{
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "newAPIHadoopFile",
@ -191,7 +191,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new RDDIpcProxy(jvmRddReference);
}
public IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
public IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
{
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "newAPIHadoopRDD",
@ -199,7 +199,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new RDDIpcProxy(jvmRddReference);
}
public IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
public IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
{
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "hadoopFile",
@ -207,7 +207,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new RDDIpcProxy(jvmRddReference);
}
public IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
public IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
{
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "hadoopRDD",

View file

@ -17,18 +17,13 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
private readonly JvmObjectReference jvmSparkSessionReference;
private readonly ISqlContextProxy sqlContextProxy;
private readonly IUdfRegistration udfRegistration;
private readonly IUdfRegistrationProxy udfRegistrationProxy;
public IUdfRegistration Udf
public IUdfRegistrationProxy Udf
{
get
{
if (udfRegistration == null)
{
//TODO implementation needed
}
return udfRegistration;
return udfRegistrationProxy;
}
}
@ -46,6 +41,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
{
this.jvmSparkSessionReference = jvmSparkSessionReference;
sqlContextProxy = new SqlContextIpcProxy(GetSqlContextReference());
udfRegistrationProxy = new UdfRegistrationIpcProxy(sqlContextProxy);
}
private JvmObjectReference GetSqlContextReference()
@ -98,4 +94,19 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSparkSessionReference, "stop");
}
}
[ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not required
internal class UdfRegistrationIpcProxy : IUdfRegistrationProxy
{
private readonly ISqlContextProxy sqlContextProxy;
internal UdfRegistrationIpcProxy(ISqlContextProxy sqlContextProxy)
{
this.sqlContextProxy = sqlContextProxy;
}
public void RegisterFunction(string name, byte[] command, string returnType)
{
sqlContextProxy.RegisterFunction(name, command, returnType);
}
}
}

View file

@ -197,7 +197,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new DStreamIpcProxy(jstream);
}
public IDStreamProxy KafkaStream(Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, StorageLevelType storageLevelType)
public IDStreamProxy KafkaStream(IEnumerable<Tuple<string, int>> topics, IEnumerable<Tuple<string, string>> kafkaParams, StorageLevelType storageLevelType)
{
JvmObjectReference jtopics = JvmBridgeUtils.GetJavaMap<string, int>(topics);
JvmObjectReference jkafkaParams = JvmBridgeUtils.GetJavaMap<string, string>(kafkaParams);
@ -208,16 +208,16 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new DStreamIpcProxy(jstream);
}
public IDStreamProxy DirectKafkaStream(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets)
public IDStreamProxy DirectKafkaStream(List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets)
{
JvmObjectReference jtopics = JvmBridgeUtils.GetJavaSet<string>(topics);
JvmObjectReference jkafkaParams = JvmBridgeUtils.GetJavaMap<string, string>(kafkaParams);
var jTopicAndPartitions = fromOffsets.Select(x =>
new KeyValuePair<JvmObjectReference, long>
new Tuple<JvmObjectReference, long>
(
SparkCLRIpcProxy.JvmBridge.CallConstructor("kafka.common.TopicAndPartition", new object[] { x.Key.Split(':')[0], int.Parse(x.Key.Split(':')[1]) }),
x.Value
SparkCLRIpcProxy.JvmBridge.CallConstructor("kafka.common.TopicAndPartition", new object[] { x.Item1.Split(':')[0], int.Parse(x.Item1.Split(':')[1]) }),
x.Item2
)
);
@ -228,17 +228,16 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new DStreamIpcProxy(jstream);
}
public IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, Dictionary<string, string> kafkaParams,
Dictionary<string, long> fromOffsets, int numPartitions, byte[] readFunc, string serializationMode)
public IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets, int numPartitions, byte[] readFunc, string serializationMode)
{
JvmObjectReference jtopics = JvmBridgeUtils.GetJavaSet<string>(topics);
JvmObjectReference jkafkaParams = JvmBridgeUtils.GetJavaMap<string, string>(kafkaParams);
var jTopicAndPartitions = fromOffsets.Select(x =>
new KeyValuePair<JvmObjectReference, long>
new Tuple<JvmObjectReference, long>
(
SparkCLRIpcProxy.JvmBridge.CallConstructor("kafka.common.TopicAndPartition", new object[] { x.Key.Split(':')[0], int.Parse(x.Key.Split(':')[1]) }),
x.Value
SparkCLRIpcProxy.JvmBridge.CallConstructor("kafka.common.TopicAndPartition", new object[] { x.Item1.Split(':')[0], int.Parse(x.Item1.Split(':')[1]) }),
x.Item2
)
);
@ -250,7 +249,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new DStreamIpcProxy(jstream);
}
public IDStreamProxy EventHubsUnionStream(Dictionary<string, string> eventHubsParams, StorageLevelType storageLevelType)
public IDStreamProxy EventHubsUnionStream(IEnumerable<Tuple<string, string>> eventHubsParams, StorageLevelType storageLevelType)
{
JvmObjectReference eventHubsParamsReference = JvmBridgeUtils.GetScalaMutableMap<string, string>(eventHubsParams);
JvmObjectReference storageLevelTypeReference = SparkContextIpcProxy.GetJavaStorageLevel(storageLevelType);

View file

@ -48,6 +48,11 @@ namespace Microsoft.Spark.CSharp.Sql
get { return sparkContext; }
}
public UdfRegistration Udf
{
get { return new UdfRegistration(sparkSessionProxy.Udf); }
}
/// <summary>
/// Builder for SparkSession
/// </summary>
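
The new Udf property exposes UDF registration directly on SparkSession. A minimal sketch (the builder call, table and column names are illustrative assumptions, not shown in this commit):

```c#
using Microsoft.Spark.CSharp.Sql;

var spark = SparkSession.Builder().AppName("UdfExample").GetOrCreate();

// Register a one-argument UDF through SparkSession.Udf ...
spark.Udf.RegisterFunction<bool, string>("NotEmpty", s => !string.IsNullOrEmpty(s));

// ... and use it from SQL, mirroring the SqlContext examples in the XML docs below.
var people = spark.Sql("SELECT name FROM people WHERE NotEmpty(name)");
people.Show();
```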

View file

@ -0,0 +1,254 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Services;
namespace Microsoft.Spark.CSharp.Sql
{
/// <summary>
/// Used for registering User Defined Functions. SparkSession.Udf is used to access instance of this type.
/// </summary>
public class UdfRegistration
{
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(UdfRegistration));
private IUdfRegistrationProxy udfRegistrationProxy;
internal UdfRegistration(IUdfRegistrationProxy udfRegistrationProxy)
{
this.udfRegistrationProxy = udfRegistrationProxy;
}
//TODO - the following section is a copy of the same functionality in SQLContext..refactoring needed
#region UDF Registration
/// <summary>
/// Register UDF with no input argument, e.g:
/// SqlContext.RegisterFunction&lt;bool>("MyFilter", () => true);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter()");
/// </summary>
/// <typeparam name="RT"></typeparam>
/// <param name="name"></param>
/// <param name="f"></param>
public void RegisterFunction<RT>(string name, Func<RT> f)
{
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT>(f).Execute;
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
}
/// <summary>
/// Register UDF with 1 input argument, e.g:
/// SqlContext.RegisterFunction&lt;bool, string>("MyFilter", (arg1) => arg1 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1)");
/// </summary>
/// <typeparam name="RT"></typeparam>
/// <typeparam name="A1"></typeparam>
/// <param name="name"></param>
/// <param name="f"></param>
public void RegisterFunction<RT, A1>(string name, Func<A1, RT> f)
{
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1>(f).Execute;
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
}
/// <summary>
/// Register UDF with 2 input arguments, e.g:
/// SqlContext.RegisterFunction&lt;bool, string, string>("MyFilter", (arg1, arg2) => arg1 != null &amp;&amp; arg2 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2)");
/// </summary>
/// <typeparam name="RT"></typeparam>
/// <typeparam name="A1"></typeparam>
/// <typeparam name="A2"></typeparam>
/// <param name="name"></param>
/// <param name="f"></param>
public void RegisterFunction<RT, A1, A2>(string name, Func<A1, A2, RT> f)
{
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2>(f).Execute;
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
}
/// <summary>
/// Register UDF with 3 input arguments, e.g:
/// SqlContext.RegisterFunction&lt;bool, string, string, string>("MyFilter", (arg1, arg2, arg3) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; arg3 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, columnName3)");
/// </summary>
/// <typeparam name="RT"></typeparam>
/// <typeparam name="A1"></typeparam>
/// <typeparam name="A2"></typeparam>
/// <typeparam name="A3"></typeparam>
/// <param name="name"></param>
/// <param name="f"></param>
public void RegisterFunction<RT, A1, A2, A3>(string name, Func<A1, A2, A3, RT> f)
{
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3>(f).Execute;
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
}
/// <summary>
/// Register UDF with 4 input arguments, e.g:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg4) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg3 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName4)");
/// </summary>
/// <typeparam name="RT"></typeparam>
/// <typeparam name="A1"></typeparam>
/// <typeparam name="A2"></typeparam>
/// <typeparam name="A3"></typeparam>
/// <typeparam name="A4"></typeparam>
/// <param name="name"></param>
/// <param name="f"></param>
public void RegisterFunction<RT, A1, A2, A3, A4>(string name, Func<A1, A2, A3, A4, RT> f)
{
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4>(f).Execute;
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
}
/// <summary>
/// Register UDF with 5 input arguments, e.g:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg5) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg5 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName5)");
/// </summary>
/// <typeparam name="RT"></typeparam>
/// <typeparam name="A1"></typeparam>
/// <typeparam name="A2"></typeparam>
/// <typeparam name="A3"></typeparam>
/// <typeparam name="A4"></typeparam>
/// <typeparam name="A5"></typeparam>
/// <param name="name"></param>
/// <param name="f"></param>
public void RegisterFunction<RT, A1, A2, A3, A4, A5>(string name, Func<A1, A2, A3, A4, A5, RT> f)
{
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4, A5>(f).Execute;
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
}
/// <summary>
/// Register UDF with 6 input arguments, e.g:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg6) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg6 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName6)");
/// </summary>
/// <typeparam name="RT"></typeparam>
/// <typeparam name="A1"></typeparam>
/// <typeparam name="A2"></typeparam>
/// <typeparam name="A3"></typeparam>
/// <typeparam name="A4"></typeparam>
/// <typeparam name="A5"></typeparam>
/// <typeparam name="A6"></typeparam>
/// <param name="name"></param>
/// <param name="f"></param>
public void RegisterFunction<RT, A1, A2, A3, A4, A5, A6>(string name, Func<A1, A2, A3, A4, A5, A6, RT> f)
{
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4, A5, A6>(f).Execute;
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
}
/// <summary>
/// Register UDF with 7 input arguments, e.g:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg7) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg7 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName7)");
/// </summary>
/// <typeparam name="RT"></typeparam>
/// <typeparam name="A1"></typeparam>
/// <typeparam name="A2"></typeparam>
/// <typeparam name="A3"></typeparam>
/// <typeparam name="A4"></typeparam>
/// <typeparam name="A5"></typeparam>
/// <typeparam name="A6"></typeparam>
/// <typeparam name="A7"></typeparam>
/// <param name="name"></param>
/// <param name="f"></param>
public void RegisterFunction<RT, A1, A2, A3, A4, A5, A6, A7>(string name, Func<A1, A2, A3, A4, A5, A6, A7, RT> f)
{
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7>(f).Execute;
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
}
/// <summary>
/// Register UDF with 8 input arguments, e.g:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg8) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg8 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName8)");
/// </summary>
/// <typeparam name="RT"></typeparam>
/// <typeparam name="A1"></typeparam>
/// <typeparam name="A2"></typeparam>
/// <typeparam name="A3"></typeparam>
/// <typeparam name="A4"></typeparam>
/// <typeparam name="A5"></typeparam>
/// <typeparam name="A6"></typeparam>
/// <typeparam name="A7"></typeparam>
/// <typeparam name="A8"></typeparam>
/// <param name="name"></param>
/// <param name="f"></param>
public void RegisterFunction<RT, A1, A2, A3, A4, A5, A6, A7, A8>(string name, Func<A1, A2, A3, A4, A5, A6, A7, A8, RT> f)
{
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7, A8>(f).Execute;
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
}
/// <summary>
/// Register UDF with 9 input arguments, e.g:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg9) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg9 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName9)");
/// </summary>
/// <typeparam name="RT"></typeparam>
/// <typeparam name="A1"></typeparam>
/// <typeparam name="A2"></typeparam>
/// <typeparam name="A3"></typeparam>
/// <typeparam name="A4"></typeparam>
/// <typeparam name="A5"></typeparam>
/// <typeparam name="A6"></typeparam>
/// <typeparam name="A7"></typeparam>
/// <typeparam name="A8"></typeparam>
/// <typeparam name="A9"></typeparam>
/// <param name="name"></param>
/// <param name="f"></param>
public void RegisterFunction<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9>(string name, Func<A1, A2, A3, A4, A5, A6, A7, A8, A9, RT> f)
{
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9>(f).Execute;
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
}
/// <summary>
/// Register UDF with 10 input arguments, e.g:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg10) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg10 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName10)");
/// </summary>
/// <typeparam name="RT"></typeparam>
/// <typeparam name="A1"></typeparam>
/// <typeparam name="A2"></typeparam>
/// <typeparam name="A3"></typeparam>
/// <typeparam name="A4"></typeparam>
/// <typeparam name="A5"></typeparam>
/// <typeparam name="A6"></typeparam>
/// <typeparam name="A7"></typeparam>
/// <typeparam name="A8"></typeparam>
/// <typeparam name="A9"></typeparam>
/// <typeparam name="A10"></typeparam>
/// <param name="name"></param>
/// <param name="f"></param>
public void RegisterFunction<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10>(string name, Func<A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT> f)
{
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10>(f).Execute;
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
}
#endregion
}
}

View file

@ -153,7 +153,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <returns></returns>
public DStream<T> Reduce(Func<T, T, T> f)
{
return Map<KeyValuePair<string, T>>(x => new KeyValuePair<string, T>(string.Empty, x)).ReduceByKey(f, 1).Map<T>(kvp => kvp.Value);
return Map<Tuple<string, T>>(x => new Tuple<string, T>(string.Empty, x)).ReduceByKey(f, 1).Map<T>(kvp => kvp.Item2);
}
/// <summary>
@ -235,9 +235,9 @@ namespace Microsoft.Spark.CSharp.Streaming
/// distinct value in each RDD of this DStream.
/// </summary>
/// <returns></returns>
public DStream<KeyValuePair<T, long>> CountByValue(int numPartitions = 0)
public DStream<Tuple<T, long>> CountByValue(int numPartitions = 0)
{
return Map(v => new KeyValuePair<T, long>(v, 1L)).ReduceByKey((x, y) => x + y, numPartitions);
return Map(v => new Tuple<T, long>(v, 1L)).ReduceByKey((x, y) => x + y, numPartitions);
}
/// <summary>
@ -427,9 +427,9 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <returns></returns>
public DStream<T> ReduceByWindow(Func<T, T, T> reduceFunc, Func<T, T, T> invReduceFunc, int windowSeconds, int slideSeconds = 0)
{
var keyed = Map(v => new KeyValuePair<int, T>(1, v));
var keyed = Map(v => new Tuple<int, T>(1, v));
var reduced = keyed.ReduceByKeyAndWindow(reduceFunc, invReduceFunc, windowSeconds, slideSeconds, 1);
return reduced.Map(kv => (T)kv.Value);
return reduced.Map(kv => (T)kv.Item2);
}
/// <summary>
@ -462,9 +462,9 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <returns></returns>
public DStream<long> CountByValueAndWindow(int windowSeconds, int slideSeconds, int numPartitions = 0)
{
var keyed = Map(v => new KeyValuePair<T, int>(v, 1));
var keyed = Map(v => new Tuple<T, int>(v, 1));
var counted = keyed.ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, windowSeconds, slideSeconds, numPartitions);
return counted.Filter(kv => kv.Value > 0).Count();
return counted.Filter(kv => kv.Item2 > 0).Count();
}
}
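
With these changes the keyed results of DStream operations surface as Tuple elements. A short sketch of a count-by-value pipeline (assuming `lines` is an existing DStream<string>; only the Tuple-based access is the point here):

```c#
// `lines` stands for any already-created DStream<string>.
DStream<Tuple<string, long>> counts = lines
    .FlatMap(line => line.Split(' '))
    .CountByValue();

// Downstream code now reads Item1/Item2 where it previously used Key/Value.
counts.Map(pair => string.Format("{0} seen {1} times", pair.Item1, pair.Item2))
      .Print();
```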

View file

@ -35,7 +35,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// </param>
/// <param name="storageLevelType">Storage level, by default it is MEMORY_ONLY</param>
/// <returns>DStream with byte[] representing events from EventHub</returns>
public static DStream<byte[]> CreateUnionStream(StreamingContext ssc, Dictionary<string, string> eventhubsParams, StorageLevelType storageLevelType = StorageLevelType.MEMORY_ONLY)
public static DStream<byte[]> CreateUnionStream(StreamingContext ssc, IEnumerable<Tuple<string, string>> eventhubsParams, StorageLevelType storageLevelType = StorageLevelType.MEMORY_ONLY)
{
return new DStream<byte[]>(ssc.streamingContextProxy.EventHubsUnionStream(eventhubsParams, storageLevelType), ssc, SerializedMode.None);
}
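
Event Hubs parameters are now supplied as a tuple sequence. A sketch with placeholder values (`ssc` is an existing StreamingContext; the parameter keys shown are typical for an Event Hubs setup but are assumptions, not mandated by this change):

```c#
using System;
using System.Collections.Generic;
using Microsoft.Spark.CSharp.Streaming;

var eventhubsParams = new List<Tuple<string, string>>
{
    Tuple.Create("eventhubs.namespace", "<namespace>"),
    Tuple.Create("eventhubs.name", "<event hub name>"),
    Tuple.Create("eventhubs.policyname", "<policy name>"),
    Tuple.Create("eventhubs.policykey", "<policy key>"),
    Tuple.Create("eventhubs.partition.count", "4")
};

DStream<byte[]> eventStream = EventHubsUtils.CreateUnionStream(ssc, eventhubsParams);
```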

View file

@ -28,7 +28,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="topics">Dict of (topic_name -> numPartitions) to consume. Each partition is consumed in its own thread.</param>
/// <param name="kafkaParams">Additional params for Kafka</param>
/// <returns>A DStream object</returns>
public static DStream<KeyValuePair<byte[], byte[]>> CreateStream(StreamingContext ssc, string zkQuorum, string groupId, Dictionary<string, int> topics, Dictionary<string, string> kafkaParams)
public static DStream<Tuple<byte[], byte[]>> CreateStream(StreamingContext ssc, string zkQuorum, string groupId, IEnumerable<Tuple<string, int>> topics, IEnumerable<Tuple<string, string>> kafkaParams)
{
return CreateStream(ssc, zkQuorum, groupId, topics, kafkaParams, StorageLevelType.MEMORY_AND_DISK_SER_2);
}
@ -43,19 +43,21 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="kafkaParams">Additional params for Kafka</param>
/// <param name="storageLevelType">RDD storage level.</param>
/// <returns>A DStream object</returns>
public static DStream<KeyValuePair<byte[], byte[]>> CreateStream(StreamingContext ssc, string zkQuorum, string groupId, Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, StorageLevelType storageLevelType)
public static DStream<Tuple<byte[], byte[]>> CreateStream(StreamingContext ssc, string zkQuorum, string groupId, IEnumerable<Tuple<string, int>> topics, IEnumerable<Tuple<string, string>> kafkaParams, StorageLevelType storageLevelType)
{
if (kafkaParams == null)
kafkaParams = new Dictionary<string, string>();
kafkaParams = new List<Tuple<string, string>>();
var kafkaParamsMap = kafkaParams.ToDictionary(x => x.Item1, x => x.Item2);
if (!string.IsNullOrEmpty(zkQuorum))
kafkaParams["zookeeper.connect"] = zkQuorum;
kafkaParamsMap["zookeeper.connect"] = zkQuorum;
if (groupId != null)
kafkaParams["group.id"] = groupId;
if (kafkaParams.ContainsKey("zookeeper.connection.timeout.ms"))
kafkaParams["zookeeper.connection.timeout.ms"] = "10000";
kafkaParamsMap["group.id"] = groupId;
if (kafkaParamsMap.ContainsKey("zookeeper.connection.timeout.ms"))
kafkaParamsMap["zookeeper.connection.timeout.ms"] = "10000";
return new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.KafkaStream(topics, kafkaParams, storageLevelType), ssc);
return new DStream<Tuple<byte[], byte[]>>(ssc.streamingContextProxy.KafkaStream(topics, kafkaParamsMap.Select(x => Tuple.Create(x.Key, x.Value)), storageLevelType), ssc);
}
/// <summary>
@ -82,7 +84,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// </param>
/// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
/// <returns>A DStream object</returns>
public static DStream<KeyValuePair<byte[], byte[]>> CreateDirectStream(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets)
public static DStream<Tuple<byte[], byte[]>> CreateDirectStream(StreamingContext ssc, List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets)
{
int numPartitions = GetNumPartitionsFromConfig(ssc, topics, kafkaParams);
if (numPartitions >= 0 ||
@ -90,9 +92,9 @@ namespace Microsoft.Spark.CSharp.Streaming
ssc.SparkContext.SparkConf.SparkConfProxy.GetInt("spark.mobius.streaming.kafka.numReceivers", 0) > 0 ||
topics.Any(topic => ssc.SparkContext.SparkConf.SparkConfProxy.GetInt("spark.mobius.streaming.kafka.maxMessagesPerTask." + topic, 0) > 0))
{
return new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
return new DStream<Tuple<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
}
return new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStream(topics, kafkaParams, fromOffsets), ssc, SerializedMode.Pair);
return new DStream<Tuple<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStream(topics, kafkaParams, fromOffsets), ssc, SerializedMode.Pair);
}
/// <summary>
@ -120,18 +122,18 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
/// <param name="readFunc">user function to process the kafka data.</param>
/// <returns>A DStream object</returns>
public static DStream<T> CreateDirectStream<T>(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets, Func<int, IEnumerable<KeyValuePair<byte[], byte[]>>, IEnumerable<T>> readFunc)
public static DStream<T> CreateDirectStream<T>(StreamingContext ssc, List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets, Func<int, IEnumerable<Tuple<byte[], byte[]>>, IEnumerable<T>> readFunc)
{
int numPartitions = GetNumPartitionsFromConfig(ssc, topics, kafkaParams);
if (ssc.SparkContext.SparkConf.SparkConfProxy.GetInt("spark.mobius.streaming.kafka.numReceivers", 0) <= 0)
{
var dstream = new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
var dstream = new DStream<Tuple<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
return dstream.MapPartitionsWithIndex(readFunc, true);
}
var mapPartitionsWithIndexHelper = new MapPartitionsWithIndexHelper<KeyValuePair<byte[], byte[]>, T>(readFunc, true);
var transformHelper = new TransformHelper<KeyValuePair<byte[], byte[]>, T>(mapPartitionsWithIndexHelper.Execute);
var transformDynamicHelper = new TransformDynamicHelper<KeyValuePair<byte[], byte[]>, T>(transformHelper.Execute);
var mapPartitionsWithIndexHelper = new MapPartitionsWithIndexHelper<Tuple<byte[], byte[]>, T>(readFunc, true);
var transformHelper = new TransformHelper<Tuple<byte[], byte[]>, T>(mapPartitionsWithIndexHelper.Execute);
var transformDynamicHelper = new TransformDynamicHelper<Tuple<byte[], byte[]>, T>(transformHelper.Execute);
Func<double, RDD<dynamic>, RDD<dynamic>> func = transformDynamicHelper.Execute;
var formatter = new BinaryFormatter();
var stream = new MemoryStream();
@ -146,11 +148,11 @@ namespace Microsoft.Spark.CSharp.Streaming
/// </summary>
/// <param name="input"></param>
/// <returns></returns>
public static OffsetRange GetOffsetRange(IEnumerable<KeyValuePair<byte[], byte[]>> input)
public static OffsetRange GetOffsetRange(IEnumerable<Tuple<byte[], byte[]>> input)
{
int count = 2;
int i = 0;
var offsetRange = new KeyValuePair<byte[], byte[]>[count];
var offsetRange = new Tuple<byte[], byte[]>[count];
foreach (var message in input)
{
offsetRange[i++ % count] = message;
@ -163,12 +165,12 @@ namespace Microsoft.Spark.CSharp.Streaming
throw new ArgumentException("Expecting kafka OffsetRange metadata.");
}
var topicAndClusterId = SerDe.ToString(offsetRange[0].Key);
var topicAndClusterId = SerDe.ToString(offsetRange[0].Item1);
var topic = topicAndClusterId.Split(',')[0];
var clusterId = topicAndClusterId.Split(',')[1];
var partition = SerDe.ToInt(offsetRange[0].Value);
var fromOffset = SerDe.ReadLong(new MemoryStream(offsetRange[1].Key));
var untilOffset = SerDe.ReadLong(new MemoryStream(offsetRange[1].Value));
var partition = SerDe.ToInt(offsetRange[0].Item2);
var fromOffset = SerDe.ReadLong(new MemoryStream(offsetRange[1].Item1));
var untilOffset = SerDe.ReadLong(new MemoryStream(offsetRange[1].Item2));
return new OffsetRange(topic, clusterId, partition, fromOffset, untilOffset);
}
@ -181,12 +183,13 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="topics"></param>
/// <param name="kafkaParams"></param>
/// <returns></returns>
private static int GetNumPartitionsFromConfig(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams)
private static int GetNumPartitionsFromConfig(StreamingContext ssc, List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams)
{
if (topics == null || topics.Count == 0)
return -1;
string clusterId = kafkaParams.ContainsKey("cluster.id") ? "." + kafkaParams["cluster.id"] : null;
var kafkaParamsMap = kafkaParams.ToDictionary(x => x.Item1, x => x.Item2);
string clusterId = kafkaParamsMap.ContainsKey("cluster.id") ? "." + kafkaParamsMap["cluster.id"] : null;
return ssc.SparkContext.SparkConf.SparkConfProxy.GetInt("spark.mobius.streaming.kafka.numPartitions." + topics[0] + clusterId, -1);
}
}
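
KafkaUtils now takes its topics, Kafka parameters and offsets as tuple sequences. A minimal sketch (broker and ZooKeeper addresses, topic and group id are placeholders; `ssc` is an existing StreamingContext):

```c#
var topics = new List<Tuple<string, int>> { Tuple.Create("mytopic", 1) };
var kafkaParams = new List<Tuple<string, string>>
{
    Tuple.Create("metadata.broker.list", "localhost:9092")
};

// Receiver-based stream: topic and parameter maps become tuple sequences.
DStream<Tuple<byte[], byte[]>> stream =
    KafkaUtils.CreateStream(ssc, "localhost:2181", "mygroup", topics, kafkaParams);

// Direct stream: per-partition offsets are ("topic:partition", offset) tuples.
var fromOffsets = new List<Tuple<string, long>> { Tuple.Create("mytopic:0", 0L) };
DStream<Tuple<byte[], byte[]>> directStream =
    KafkaUtils.CreateDirectStream(ssc, new List<string> { "mytopic" }, kafkaParams, fromOffsets);
```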

View file

@ -29,9 +29,9 @@ namespace Microsoft.Spark.CSharp.Streaming
[Serializable]
public class MapWithStateDStream<K, V, S, M> : DStream<M>
{
internal DStream<KeyValuePair<K, S>> snapshotsDStream;
internal DStream<Tuple<K, S>> snapshotsDStream;
internal MapWithStateDStream(DStream<M> mappedDataDStream, DStream<KeyValuePair<K, S>> snapshotsDStream)
internal MapWithStateDStream(DStream<M> mappedDataDStream, DStream<Tuple<K, S>> snapshotsDStream)
: base(mappedDataDStream.DStreamProxy, mappedDataDStream.streamingContext)
{
this.snapshotsDStream = snapshotsDStream;
@ -40,7 +40,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <summary>
/// Return a pair DStream where each RDD is the snapshot of the state of all the keys.
/// </summary>
public DStream<KeyValuePair<K, S>> StateSnapshots()
public DStream<Tuple<K, S>> StateSnapshots()
{
return snapshotsDStream;
}
@ -87,11 +87,11 @@ namespace Microsoft.Spark.CSharp.Streaming
{
}
public MapWithStateRDDRecord(long t, IEnumerable<KeyValuePair<K, S>> iter)
public MapWithStateRDDRecord(long t, IEnumerable<Tuple<K, S>> iter)
{
foreach (var p in iter)
{
stateMap[p.Key] = new KeyedState<S>(p.Value, t);
stateMap[p.Item1] = new KeyedState<S>(p.Item2, t);
}
}
}
@ -131,14 +131,14 @@ namespace Microsoft.Spark.CSharp.Streaming
while (enumerator.MoveNext())
{
KeyValuePair<K, V> kv = enumerator.Current;
Tuple<K, V> kv = enumerator.Current;
KeyedState<S> keyedState;
State<S> wrappedState = stateRddRecord.stateMap.TryGetValue(kv.Key, out keyedState) ? new State<S>(keyedState.state) : new State<S>(default(S));
State<S> wrappedState = stateRddRecord.stateMap.TryGetValue(kv.Item1, out keyedState) ? new State<S>(keyedState.state) : new State<S>(default(S));
var mappedData = default(M);
try
{
mappedData = f(kv.Key, kv.Value, wrappedState);
mappedData = f(kv.Item1, kv.Item2, wrappedState);
}
catch (Exception e)
{
@ -149,11 +149,11 @@ namespace Microsoft.Spark.CSharp.Streaming
if (wrappedState.removed)
{
stateRddRecord.stateMap.Remove(kv.Key);
stateRddRecord.stateMap.Remove(kv.Item1);
}
else if (wrappedState.updated || wrappedState.defined)
{
stateRddRecord.stateMap[kv.Key] = new KeyedState<S>(wrappedState.state, ticks);
stateRddRecord.stateMap[kv.Item1] = new KeyedState<S>(wrappedState.state, ticks);
}
}
@ -223,7 +223,7 @@ namespace Microsoft.Spark.CSharp.Streaming
valuesRDD = prevFunc(t, valuesRDD);
}
var values = valuesRDD.ConvertTo<KeyValuePair<K, V>>().PartitionBy(stateSpec.numPartitions);
var values = valuesRDD.ConvertTo<Tuple<K, V>>().PartitionBy(stateSpec.numPartitions);
if (stateRDD == null)
{
@ -259,12 +259,12 @@ namespace Microsoft.Spark.CSharp.Streaming
this.ticks = ticks;
}
internal IEnumerable<MapWithStateRDDRecord<K, S, M>> Execute(IEnumerable<KeyValuePair<K, S>> iter)
internal IEnumerable<MapWithStateRDDRecord<K, S, M>> Execute(IEnumerable<Tuple<K, S>> iter)
{
return new[] {new MapWithStateRDDRecord<K, S, M>(ticks, iter)};
}
internal IEnumerable<MapWithStateRDDRecord<K, S, M>> ExecuteWithoutInitialState(IEnumerable<KeyValuePair<K, V>> iter)
internal IEnumerable<MapWithStateRDDRecord<K, S, M>> ExecuteWithoutInitialState(IEnumerable<Tuple<K, V>> iter)
{
return new[] { new MapWithStateRDDRecord<K, S, M>() };
}
@ -283,7 +283,7 @@ namespace Microsoft.Spark.CSharp.Streaming
internal Func<K, V, State<S>, M> mappingFunction;
internal int numPartitions;
internal TimeSpan idleDuration = TimeSpan.FromTicks(0);
internal RDD<KeyValuePair<K, S>> initialState = null;
internal RDD<Tuple<K, S>> initialState = null;
/// <summary>
/// Create a StateSpec for setting all the specifications of the `mapWithState` operation on a pair DStream.
@ -325,7 +325,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// </summary>
/// <param name="initialState">The given initial state</param>
/// <returns>The new StateSpec object</returns>
public StateSpec<K, V, S, M> InitialState(RDD<KeyValuePair<K, S>> initialState)
public StateSpec<K, V, S, M> InitialState(RDD<Tuple<K, S>> initialState)
{
this.initialState = initialState;
return this;
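
State snapshots and initial state now flow through Tuple pairs as well. A sketch of a running word count with mapWithState, under assumptions: `pairs` is an existing DStream<Tuple<string, int>>, StateSpec is constructed from the mapping function as in this file, and the accumulation logic is purely illustrative:

```c#
var stateSpec = new StateSpec<string, int, int, Tuple<string, int>>((word, count, state) =>
{
    // Accumulate a running total per key; State<S> carries the value between batches.
    int total = count + (state.Exists() ? state.Get() : 0);
    state.Update(total);
    return new Tuple<string, int>(word, total);
});

MapWithStateDStream<string, int, int, Tuple<string, int>> totals = pairs.MapWithState(stateSpec);

// Snapshots of all keyed state are now Tuple<K, S> pairs.
DStream<Tuple<string, int>> snapshots = totals.StateSnapshots();
```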

View file

@ -15,7 +15,7 @@ using Microsoft.Spark.CSharp.Interop;
namespace Microsoft.Spark.CSharp.Streaming
{
/// <summary>
/// operations only available to KeyValuePair RDD
/// operations only available to Tuple RDD
/// </summary>
public static class PairDStreamFunctions
{
@ -28,7 +28,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="reduceFunc"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, V>> ReduceByKey<K, V>(this DStream<KeyValuePair<K, V>> self, Func<V, V, V> reduceFunc, int numPartitions = 0)
public static DStream<Tuple<K, V>> ReduceByKey<K, V>(this DStream<Tuple<K, V>> self, Func<V, V, V> reduceFunc, int numPartitions = 0)
{
return self.CombineByKey(() => default(V), reduceFunc, reduceFunc, numPartitions);
}
@ -45,8 +45,8 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="mergeCombiners"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, C>> CombineByKey<K, V, C>(
this DStream<KeyValuePair<K, V>> self,
public static DStream<Tuple<K, C>> CombineByKey<K, V, C>(
this DStream<Tuple<K, V>> self,
Func<C> createCombiner,
Func<C, V, C> mergeValue,
Func<C, C, C> mergeCombiners,
@ -55,7 +55,7 @@ namespace Microsoft.Spark.CSharp.Streaming
if (numPartitions <= 0)
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
return self.Transform<KeyValuePair<K, C>>(new CombineByKeyHelper<K, V, C>(createCombiner, mergeValue, mergeCombiners, numPartitions).Execute);
return self.Transform<Tuple<K, C>>(new CombineByKeyHelper<K, V, C>(createCombiner, mergeValue, mergeCombiners, numPartitions).Execute);
}
/// <summary>
@ -66,12 +66,12 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="self"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, V>> PartitionBy<K, V>(this DStream<KeyValuePair<K, V>> self, int numPartitions = 0)
public static DStream<Tuple<K, V>> PartitionBy<K, V>(this DStream<Tuple<K, V>> self, int numPartitions = 0)
{
if (numPartitions <= 0)
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
return self.Transform<KeyValuePair<K, V>>(new PartitionByHelper<K, V>(numPartitions).Execute);
return self.Transform<Tuple<K, V>>(new PartitionByHelper<K, V>(numPartitions).Execute);
}
/// <summary>
@ -84,7 +84,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="self"></param>
/// <param name="func"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, U>> MapValues<K, V, U>(this DStream<KeyValuePair<K, V>> self, Func<V, U> func)
public static DStream<Tuple<K, U>> MapValues<K, V, U>(this DStream<Tuple<K, V>> self, Func<V, U> func)
{
return self.Map(new MapValuesHelper<K, V, U>(func).Execute, true);
}
@ -99,7 +99,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="self"></param>
/// <param name="func"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, U>> FlatMapValues<K, V, U>(this DStream<KeyValuePair<K, V>> self, Func<V, IEnumerable<U>> func)
public static DStream<Tuple<K, U>> FlatMapValues<K, V, U>(this DStream<Tuple<K, V>> self, Func<V, IEnumerable<U>> func)
{
return self.FlatMap(new FlatMapValuesHelper<K, V, U>(func).Execute, true);
}
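
`MapValues` and `FlatMapValues` transform only `Item2` and keep `Item1` and the partitioning intact; a small sketch assuming a hypothetical `DStream<Tuple<string, string[]>>` named `fruits`:

```c#
// Replace each array value with its length, keys untouched
var lengths = fruits.MapValues(v => v.Length);    // DStream<Tuple<string, int>>

// Expand each array value into one (key, element) pair per element
var flattened = fruits.FlatMapValues(v => v);     // DStream<Tuple<string, string>>
```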
@ -112,9 +112,9 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="self"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, List<V>>> GroupByKey<K, V>(this DStream<KeyValuePair<K, V>> self, int numPartitions = 0)
public static DStream<Tuple<K, List<V>>> GroupByKey<K, V>(this DStream<Tuple<K, V>> self, int numPartitions = 0)
{
return self.Transform<KeyValuePair<K, List<V>>>(new GroupByKeyHelper<K, V>(numPartitions).Execute);
return self.Transform<Tuple<K, List<V>>>(new GroupByKeyHelper<K, V>(numPartitions).Execute);
}
/// <summary>
@ -128,12 +128,12 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, Tuple<List<V>, List<W>>>> GroupWith<K, V, W>(this DStream<KeyValuePair<K, V>> self, DStream<KeyValuePair<K, W>> other, int numPartitions = 0)
public static DStream<Tuple<K, Tuple<List<V>, List<W>>>> GroupWith<K, V, W>(this DStream<Tuple<K, V>> self, DStream<Tuple<K, W>> other, int numPartitions = 0)
{
if (numPartitions <= 0)
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
return self.TransformWith<KeyValuePair<K, W>, KeyValuePair<K, Tuple<List<V>, List<W>>>>(new GroupWithHelper<K, V, W>(numPartitions).Execute, other);
return self.TransformWith<Tuple<K, W>, Tuple<K, Tuple<List<V>, List<W>>>>(new GroupWithHelper<K, V, W>(numPartitions).Execute, other);
}
/// <summary>
@ -147,12 +147,12 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, Tuple<V, W>>> Join<K, V, W>(this DStream<KeyValuePair<K, V>> self, DStream<KeyValuePair<K, W>> other, int numPartitions = 0)
public static DStream<Tuple<K, Tuple<V, W>>> Join<K, V, W>(this DStream<Tuple<K, V>> self, DStream<Tuple<K, W>> other, int numPartitions = 0)
{
if (numPartitions <= 0)
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
return self.TransformWith<KeyValuePair<K, W>, KeyValuePair<K, Tuple<V, W>>>(new JoinHelper<K, V, W>(numPartitions).Execute, other);
return self.TransformWith<Tuple<K, W>, Tuple<K, Tuple<V, W>>>(new JoinHelper<K, V, W>(numPartitions).Execute, other);
}
/// <summary>
@ -166,12 +166,12 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, Tuple<V, Option<W>>>> LeftOuterJoin<K, V, W>(this DStream<KeyValuePair<K, V>> self, DStream<KeyValuePair<K, W>> other, int numPartitions = 0)
public static DStream<Tuple<K, Tuple<V, Option<W>>>> LeftOuterJoin<K, V, W>(this DStream<Tuple<K, V>> self, DStream<Tuple<K, W>> other, int numPartitions = 0)
{
if (numPartitions <= 0)
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
return self.TransformWith<KeyValuePair<K, W>, KeyValuePair<K, Tuple<V, Option<W>>>>(new LeftOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
return self.TransformWith<Tuple<K, W>, Tuple<K, Tuple<V, Option<W>>>>(new LeftOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
}
/// <summary>
@ -185,12 +185,12 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, Tuple<Option<V>, W>>> RightOuterJoin<K, V, W>(this DStream<KeyValuePair<K, V>> self, DStream<KeyValuePair<K, W>> other, int numPartitions = 0)
public static DStream<Tuple<K, Tuple<Option<V>, W>>> RightOuterJoin<K, V, W>(this DStream<Tuple<K, V>> self, DStream<Tuple<K, W>> other, int numPartitions = 0)
{
if (numPartitions <= 0)
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
return self.TransformWith<KeyValuePair<K, W>, KeyValuePair<K, Tuple<Option<V>, W>>>(new RightOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
return self.TransformWith<Tuple<K, W>, Tuple<K, Tuple<Option<V>, W>>>(new RightOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
}
/// <summary>
@ -204,12 +204,12 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, Tuple<Option<V>, Option<W>>>> FullOuterJoin<K, V, W>(this DStream<KeyValuePair<K, V>> self, DStream<KeyValuePair<K, W>> other, int numPartitions = 0)
public static DStream<Tuple<K, Tuple<Option<V>, Option<W>>>> FullOuterJoin<K, V, W>(this DStream<Tuple<K, V>> self, DStream<Tuple<K, W>> other, int numPartitions = 0)
{
if (numPartitions <= 0)
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
return self.TransformWith<KeyValuePair<K, W>, KeyValuePair<K, Tuple<Option<V>, Option<W>>>>(new FullOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
return self.TransformWith<Tuple<K, W>, Tuple<K, Tuple<Option<V>, Option<W>>>>(new FullOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
}
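
The join family keeps its shapes after the migration: inner joins yield `Tuple<K, Tuple<V, W>>`, and the outer joins wrap the possibly-missing side in `Option<>`. A hedged sketch over two hypothetical `DStream<Tuple<string, int>>` streams, `left` and `right`:

```c#
var inner = left.Join(right);           // DStream<Tuple<string, Tuple<int, int>>>
var outer = left.LeftOuterJoin(right);  // DStream<Tuple<string, Tuple<int, Option<int>>>>

// Keys are read via Item1, the joined values via Item2.Item1 / Item2.Item2
var keys = inner.Map(t => t.Item1);
```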
/// <summary>
@ -227,7 +227,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// </param>
/// <param name="numPartitions">Number of partitions of each RDD in the new DStream.</param>
/// <returns></returns>
public static DStream<KeyValuePair<K, IEnumerable<V>>> GroupByKeyAndWindow<K, V>(this DStream<KeyValuePair<K, V>> self,
public static DStream<Tuple<K, IEnumerable<V>>> GroupByKeyAndWindow<K, V>(this DStream<Tuple<K, V>> self,
int windowSeconds, int slideSeconds, int numPartitions = 0)
{
var ls = self.MapValues(x => new List<V> { x });
@ -259,13 +259,13 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="numPartitions">number of partitions of each RDD in the new DStream.</param>
/// <param name="filterFunc">function to filter expired key-value pairs; only pairs that satisfy the function are retained set this to null if you do not want to filter</param>
/// <returns></returns>
public static DStream<KeyValuePair<K, V>> ReduceByKeyAndWindow<K, V>(this DStream<KeyValuePair<K, V>> self,
public static DStream<Tuple<K, V>> ReduceByKeyAndWindow<K, V>(this DStream<Tuple<K, V>> self,
Func<V, V, V> reduceFunc,
Func<V, V, V> invReduceFunc,
int windowSeconds,
int slideSeconds = 0,
int numPartitions = 0,
Func<KeyValuePair<K, V>, bool> filterFunc = null)
Func<Tuple<K, V>, bool> filterFunc = null)
{
self.ValidateWindowParam(windowSeconds, slideSeconds);
@ -294,7 +294,7 @@ namespace Microsoft.Spark.CSharp.Streaming
formatter.Serialize(invStream, invReduceF);
}
return new DStream<KeyValuePair<K, V>>(
return new DStream<Tuple<K, V>>(
SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpReducedWindowedDStream(
reduced.DStreamProxy,
stream.ToArray(),
@ -321,8 +321,8 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="initialState">Initial state value of each key</param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
Func<IEnumerable<V>, S, S> updateFunc, RDD<KeyValuePair<K, S>> initialState = null,
public static DStream<Tuple<K, S>> UpdateStateByKey<K, V, S>(this DStream<Tuple<K, V>> self,
Func<IEnumerable<V>, S, S> updateFunc, RDD<Tuple<K, S>> initialState = null,
int numPartitions = 0)
{
return UpdateStateByKey<K, V, S>(self, new UpdateStateByKeyHelper<K, V, S>(updateFunc).Execute, initialState, numPartitions);
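
A minimal running-count sketch for the Tuple-based `UpdateStateByKey`. Here `wordCounts` is a hypothetical `DStream<Tuple<string, int>>`, `sc` is an existing SparkContext, and `System.Linq` is assumed for `Sum()`; the optional seed is an `RDD<Tuple<string, int>>`, matching the new `initialState` parameter type:

```c#
// Optional seed state for selected keys
var initial = sc.Parallelize(new[] { new Tuple<string, int>("spark", 10) }, 1);

// Fold each batch's values for a key into the previous state
var running = wordCounts.UpdateStateByKey<string, int, int>(
    (values, state) => state + values.Sum(),
    initial);
```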
@ -340,11 +340,11 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="initialState">Initial state value of each key</param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
Func<IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> updateFunc, RDD<KeyValuePair<K, S>> initialState = null,
public static DStream<Tuple<K, S>> UpdateStateByKey<K, V, S>(this DStream<Tuple<K, V>> self,
Func<IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<Tuple<K, S>>> updateFunc, RDD<Tuple<K, S>> initialState = null,
int numPartitions = 0)
{
return UpdateStateByKey<K, V, S>(self, new MapPartitionsHelper<KeyValuePair<K, Tuple<IEnumerable<V>, S>>, KeyValuePair<K, S>>(updateFunc).Execute, initialState, numPartitions);
return UpdateStateByKey<K, V, S>(self, new MapPartitionsHelper<Tuple<K, Tuple<IEnumerable<V>, S>>, Tuple<K, S>>(updateFunc).Execute, initialState, numPartitions);
}
/// <summary>
@ -359,9 +359,9 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="initialState">Initial state value of each key</param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
Func<int, IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> updateFunc,
RDD<KeyValuePair<K, S>> initialState = null, int numPartitions = 0)
public static DStream<Tuple<K, S>> UpdateStateByKey<K, V, S>(this DStream<Tuple<K, V>> self,
Func<int, IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<Tuple<K, S>>> updateFunc,
RDD<Tuple<K, S>> initialState = null, int numPartitions = 0)
{
if (numPartitions <= 0)
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
@ -377,7 +377,7 @@ namespace Microsoft.Spark.CSharp.Streaming
var stream = new MemoryStream();
formatter.Serialize(stream, func);
return new DStream<KeyValuePair<K, S>>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
return new DStream<Tuple<K, S>>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
ds.DStreamProxy,
stream.ToArray(),
"CSharpStateDStream",
@ -390,14 +390,14 @@ namespace Microsoft.Spark.CSharp.Streaming
/// Return a new "state" DStream where the state for each key is updated by applying
/// the given function on the previous state of the key and the new values of the key.
/// </summary>
public static MapWithStateDStream<K, V, S, M> MapWithState<K, V, S, M>(this DStream<KeyValuePair<K, V>> self, StateSpec<K, V, S, M> stateSpec)
public static MapWithStateDStream<K, V, S, M> MapWithState<K, V, S, M>(this DStream<Tuple<K, V>> self, StateSpec<K, V, S, M> stateSpec)
{
if (stateSpec.numPartitions <= 0)
{
stateSpec = stateSpec.NumPartitions(self.streamingContext.SparkContext.DefaultParallelism);
}
Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = self.Piplinable ? (self as TransformedDStream<KeyValuePair<K, V>>).func : null;
Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = self.Piplinable ? (self as TransformedDStream<Tuple<K, V>>).func : null;
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new MapWithStateHelper<K, V, S, M>(prevFunc, stateSpec).Execute;
@ -414,8 +414,8 @@ namespace Microsoft.Spark.CSharp.Streaming
self.streamingContext);
DStream<M> mappedDataDStream = mapWithStateDStream.FlatMap(r => r.mappedData);
DStream<KeyValuePair<K, S>> snapshotsDStream = mapWithStateDStream.FlatMap(
r => r.stateMap.Select(entry => new KeyValuePair<K, S>(entry.Key, entry.Value.state)));
DStream<Tuple<K, S>> snapshotsDStream = mapWithStateDStream.FlatMap(
r => r.stateMap.Select(entry => new Tuple<K, S>(entry.Key, entry.Value.state)));
return new MapWithStateDStream<K, V, S, M>(mappedDataDStream, snapshotsDStream);
}
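
A hedged sketch of `MapWithState` over a hypothetical `DStream<Tuple<string, int>>` named `wordCounts`. The `State<int>` members (`Exists`/`Get`/`Update`) are assumed to mirror Spark's State API, and building the spec via a public `StateSpec` constructor is an assumption; the `NumPartitions` chaining follows the code above:

```c#
Func<string, int, State<int>, Tuple<string, int>> mappingFunc = (word, one, state) =>
{
    // Accumulate the running count for this key and emit the updated pair
    var sum = one + (state.Exists() ? state.Get() : 0);
    state.Update(sum);
    return new Tuple<string, int>(word, sum);
};

// Constructor-based creation of the spec is assumed here
var spec = new StateSpec<string, int, int, Tuple<string, int>>(mappingFunc).NumPartitions(2);
var stateful = wordCounts.MapWithState(spec);
```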
@ -443,7 +443,7 @@ namespace Microsoft.Spark.CSharp.Streaming
this.numPartitions = numPartitions;
}
internal RDD<KeyValuePair<K, C>> Execute(RDD<KeyValuePair<K, V>> rdd)
internal RDD<Tuple<K, C>> Execute(RDD<Tuple<K, V>> rdd)
{
return rdd.CombineByKey(createCombiner, mergeValue, mergeCombiners, numPartitions);
}
@ -458,7 +458,7 @@ namespace Microsoft.Spark.CSharp.Streaming
this.numPartitions = numPartitions;
}
internal RDD<KeyValuePair<K, V>> Execute(RDD<KeyValuePair<K, V>> rdd)
internal RDD<Tuple<K, V>> Execute(RDD<Tuple<K, V>> rdd)
{
return rdd.PartitionBy(numPartitions);
}
@ -473,7 +473,7 @@ namespace Microsoft.Spark.CSharp.Streaming
this.numPartitions = numPartitions;
}
internal RDD<byte[]> Execute(RDD<KeyValuePair<K, V>> rdd)
internal RDD<byte[]> Execute(RDD<Tuple<K, V>> rdd)
{
var keyed = rdd.MapPartitionsWithIndex(new PairRDDFunctions.AddShuffleKeyHelper<K, V>(numPartitions).Execute, true);
keyed.bypassSerializer = true;
@ -492,9 +492,9 @@ namespace Microsoft.Spark.CSharp.Streaming
func = f;
}
internal KeyValuePair<K, U> Execute(KeyValuePair<K, V> kvp)
internal Tuple<K, U> Execute(Tuple<K, V> kvp)
{
return new KeyValuePair<K, U>(kvp.Key, func(kvp.Value));
return new Tuple<K, U>(kvp.Item1, func(kvp.Item2));
}
}
@ -507,9 +507,9 @@ namespace Microsoft.Spark.CSharp.Streaming
func = f;
}
internal IEnumerable<KeyValuePair<K, U>> Execute(KeyValuePair<K, V> kvp)
internal IEnumerable<Tuple<K, U>> Execute(Tuple<K, V> kvp)
{
return func(kvp.Value).Select(v => new KeyValuePair<K, U>(kvp.Key, v));
return func(kvp.Item2).Select(v => new Tuple<K, U>(kvp.Item1, v));
}
}
@ -522,7 +522,7 @@ namespace Microsoft.Spark.CSharp.Streaming
this.numPartitions = numPartitions;
}
internal RDD<KeyValuePair<K, List<V>>> Execute(RDD<KeyValuePair<K, V>> rdd)
internal RDD<Tuple<K, List<V>>> Execute(RDD<Tuple<K, V>> rdd)
{
return rdd.GroupByKey(numPartitions);
}
@ -537,7 +537,7 @@ namespace Microsoft.Spark.CSharp.Streaming
this.numPartitions = numPartitions;
}
internal RDD<KeyValuePair<K, Tuple<List<V>, List<W>>>> Execute(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
internal RDD<Tuple<K, Tuple<List<V>, List<W>>>> Execute(RDD<Tuple<K, V>> l, RDD<Tuple<K, W>> r)
{
return l.GroupWith<K, V, W>(r, numPartitions);
}
@ -552,7 +552,7 @@ namespace Microsoft.Spark.CSharp.Streaming
this.numPartitions = numPartitions;
}
internal RDD<KeyValuePair<K, Tuple<V, W>>> Execute(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
internal RDD<Tuple<K, Tuple<V, W>>> Execute(RDD<Tuple<K, V>> l, RDD<Tuple<K, W>> r)
{
return l.Join<K, V, W>(r, numPartitions);
}
@ -567,7 +567,7 @@ namespace Microsoft.Spark.CSharp.Streaming
this.numPartitions = numPartitions;
}
internal RDD<KeyValuePair<K, Tuple<V, Option<W>>>> Execute(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
internal RDD<Tuple<K, Tuple<V, Option<W>>>> Execute(RDD<Tuple<K, V>> l, RDD<Tuple<K, W>> r)
{
return l.LeftOuterJoin<K, V, W>(r, numPartitions);
}
@ -582,7 +582,7 @@ namespace Microsoft.Spark.CSharp.Streaming
this.numPartitions = numPartitions;
}
internal RDD<KeyValuePair<K, Tuple<Option<V>, W>>> Execute(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
internal RDD<Tuple<K, Tuple<Option<V>, W>>> Execute(RDD<Tuple<K, V>> l, RDD<Tuple<K, W>> r)
{
return l.RightOuterJoin<K, V, W>(r, numPartitions);
}
@ -597,7 +597,7 @@ namespace Microsoft.Spark.CSharp.Streaming
this.numPartitions = numPartitions;
}
internal RDD<KeyValuePair<K, Tuple<Option<V>, Option<W>>>> Execute(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
internal RDD<Tuple<K, Tuple<Option<V>, Option<W>>>> Execute(RDD<Tuple<K, V>> l, RDD<Tuple<K, W>> r)
{
return l.FullOuterJoin<K, V, W>(r, numPartitions);
}
@ -609,12 +609,12 @@ namespace Microsoft.Spark.CSharp.Streaming
private readonly Func<V, V, V> reduceFunc;
private readonly Func<V, V, V> invReduceFunc;
private readonly int numPartitions;
private readonly Func<KeyValuePair<K, V>, bool> filterFunc;
private readonly Func<Tuple<K, V>, bool> filterFunc;
internal ReduceByKeyAndWindowHelper(Func<V, V, V> reduceF,
Func<V, V, V> invReduceF,
int numPartitions,
Func<KeyValuePair<K, V>, bool> filterF)
Func<Tuple<K, V>, bool> filterF)
{
reduceFunc = reduceF;
invReduceFunc = invReduceF;
@ -625,11 +625,11 @@ namespace Microsoft.Spark.CSharp.Streaming
internal RDD<dynamic> Reduce(double t, RDD<dynamic> a, RDD<dynamic> b)
{
b.partitioner = new Partitioner(numPartitions, null);
var r = b.ConvertTo<KeyValuePair<K, V>>();
var r = b.ConvertTo<Tuple<K, V>>();
if (a != null)
{
a.partitioner = b.partitioner;
r = a.ConvertTo<KeyValuePair<K, V>>().Union(r);
r = a.ConvertTo<Tuple<K, V>>().Union(r);
}
r = r.ReduceByKey<K, V>(reduceFunc, numPartitions);
if (filterFunc != null)
@ -640,8 +640,8 @@ namespace Microsoft.Spark.CSharp.Streaming
internal RDD<dynamic> InvReduce(double t, RDD<dynamic> a, RDD<dynamic> b)
{
a.partitioner = b.partitioner = new Partitioner(numPartitions, null);
var rddb = b.ConvertTo<KeyValuePair<K, V>>().ReduceByKey<K, V>(reduceFunc, numPartitions);
var rdda = a.ConvertTo<KeyValuePair<K, V>>();
var rddb = b.ConvertTo<Tuple<K, V>>().ReduceByKey<K, V>(reduceFunc, numPartitions);
var rdda = a.ConvertTo<Tuple<K, V>>();
var joined = rdda.Join<K, V, V>(rddb, numPartitions);
var r = joined.MapValues<K, Tuple<V, V>, V>(kv => kv.Item2 != null ? invReduceFunc(kv.Item1, kv.Item2) : kv.Item1);
return r.ConvertTo<dynamic>();
@ -658,21 +658,21 @@ namespace Microsoft.Spark.CSharp.Streaming
func = f;
}
internal IEnumerable<KeyValuePair<K, S>> Execute(IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>> input)
internal IEnumerable<Tuple<K, S>> Execute(IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>> input)
{
return input.Select(x => new KeyValuePair<K, S>(x.Key, func(x.Value.Item1, x.Value.Item2)));
return input.Select(x => new Tuple<K, S>(x.Item1, func(x.Item2.Item1, x.Item2.Item2)));
}
}
[Serializable]
internal class UpdateStateByKeysHelper<K, V, S>
{
private readonly Func<int, IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> func;
private readonly RDD<KeyValuePair<K, S>> initialState;
private readonly Func<int, IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<Tuple<K, S>>> func;
private readonly RDD<Tuple<K, S>> initialState;
private readonly int numPartitions;
internal UpdateStateByKeysHelper(
Func<int, IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> f,
RDD<KeyValuePair<K, S>> initialState, int numPartitions)
Func<int, IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<Tuple<K, S>>> f,
RDD<Tuple<K, S>> initialState, int numPartitions)
{
func = f;
this.initialState = initialState;
@ -681,11 +681,11 @@ namespace Microsoft.Spark.CSharp.Streaming
internal RDD<dynamic> Execute(double t, RDD<dynamic> stateRDD, RDD<dynamic> valuesRDD)
{
RDD<KeyValuePair<K, S>> state = null;
RDD<KeyValuePair<K, Tuple<IEnumerable<V>, S>>> g = null;
RDD<Tuple<K, S>> state = null;
RDD<Tuple<K, Tuple<IEnumerable<V>, S>>> g = null;
// call into scala side partitionBy directly since AddShuffleKey already applied
var values = new RDD<KeyValuePair<K, V>>(valuesRDD.sparkContext.SparkContextProxy.CreatePairwiseRDD(valuesRDD.rddProxy, numPartitions, 0), valuesRDD.sparkContext);
var values = new RDD<Tuple<K, V>>(valuesRDD.sparkContext.SparkContextProxy.CreatePairwiseRDD(valuesRDD.rddProxy, numPartitions, 0), valuesRDD.sparkContext);
values.partitioner = new Partitioner(numPartitions, null);
if (stateRDD == null)
@ -706,12 +706,12 @@ namespace Microsoft.Spark.CSharp.Streaming
}
else
{
state = stateRDD.ConvertTo<KeyValuePair<K, S>>();
state = stateRDD.ConvertTo<Tuple<K, S>>();
state.partitioner = values.partitioner;
g = state.GroupWith(values, numPartitions).MapValues(x => new Tuple<IEnumerable<V>, S>(new List<V>(x.Item2), x.Item1.Count > 0 ? x.Item1[0] : default(S)));
}
state = g.MapPartitionsWithIndex((pid, iter) => func(pid, iter), true).Filter(x => x.Value != null);
state = g.MapPartitionsWithIndex((pid, iter) => func(pid, iter), true).Filter(x => x.Item2 != null);
return state.ConvertTo<dynamic>();
}

View file

@ -451,9 +451,9 @@
a function to sort the key.
</summary>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.SortByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Boolean,System.Nullable{System.Int32})">
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.SortByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Boolean,System.Nullable{System.Int32})">
<summary>
Sorts this RDD, which is assumed to consist of KeyValuePair pairs.
Sorts this RDD, which is assumed to consist of Tuple pairs.
</summary>
<typeparam name="K"></typeparam>
<typeparam name="V"></typeparam>
@ -462,9 +462,9 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.SortByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Boolean,System.Nullable{System.Int32},System.Func{``0,``2})">
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.SortByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Boolean,System.Nullable{System.Int32},System.Func{``0,``2})">
<summary>
Sorts this RDD, which is assumed to consist of KeyValuePairs. If key is type of string, case is sensitive.
Sorts this RDD, which is assumed to consist of Tuples. If Item1 is of type string, the sort is case sensitive.
</summary>
<typeparam name="K"></typeparam>
<typeparam name="V"></typeparam>
@ -472,10 +472,10 @@
<param name="self"></param>
<param name="ascending"></param>
<param name="numPartitions">Number of partitions. Each partition of the sorted RDD contains a sorted range of the elements.</param>
<param name="keyFunc">RDD will sort by keyFunc(key) for every key in KeyValuePair. Must not be null.</param>
<param name="keyFunc">RDD will sort by keyFunc(key) for every Item1 in Tuple. Must not be null.</param>
<returns></returns>
</member>
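
The `keyFunc` overload sorts a Tuple RDD on a projection of `Item1`, e.g. a case-insensitive sort; a hedged sketch assuming an existing SparkContext `sc`:

```c#
var rdd = sc.Parallelize(new[]
{
    new Tuple<string, int>("b", 2),
    new Tuple<string, int>("A", 1),
    new Tuple<string, int>("a", 3)
}, 1);

// Ascending sort on the lower-cased key, so "A" and "a" sort together
var sorted = rdd.SortByKey(true, null, k => k.ToLowerInvariant()).Collect();
```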
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.repartitionAndSortWithinPartitions``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Nullable{System.Int32},System.Func{``0,System.Int32},System.Boolean)">
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.repartitionAndSortWithinPartitions``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Nullable{System.Int32},System.Func{``0,System.Int32},System.Boolean)">
<summary>
Repartition the RDD according to the given partitioner and, within each resulting partition,
sort records by their keys.
@ -493,16 +493,16 @@
</member>
<member name="T:Microsoft.Spark.CSharp.Core.PairRDDFunctions">
<summary>
operations only available to KeyValuePair RDD
operations only available to Tuple RDD
See also http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions
</summary>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.CollectAsMap``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}})">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.CollectAsMap``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}})">
<summary>
Return the key-value pairs in this RDD to the master as a dictionary.
var m = sc.Parallelize(new[] { new KeyValuePair&lt;int, int>(1, 2), new KeyValuePair&lt;int, int>(3, 4) }, 1).CollectAsMap()
var m = sc.Parallelize(new[] { new Tuple&lt;int, int>(1, 2), new Tuple&lt;int, int>(3, 4) }, 1).CollectAsMap()
m[1]
2
m[3]
@ -514,11 +514,11 @@
<param name="self"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Keys``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}})">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Keys``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}})">
<summary>
Return an RDD with the keys of each tuple.
>>> m = sc.Parallelize(new[] { new KeyValuePair&lt;int, int>(1, 2), new KeyValuePair&lt;int, int>(3, 4) }, 1).Keys().Collect()
>>> m = sc.Parallelize(new[] { new Tuple&lt;int, int>(1, 2), new Tuple&lt;int, int>(3, 4) }, 1).Keys().Collect()
[1, 3]
</summary>
<typeparam name="K"></typeparam>
@ -526,11 +526,11 @@
<param name="self"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Values``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}})">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Values``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}})">
<summary>
Return an RDD with the values of each tuple.
>>> m = sc.Parallelize(new[] { new KeyValuePair&lt;int, int>(1, 2), new KeyValuePair&lt;int, int>(3, 4) }, 1).Values().Collect()
>>> m = sc.Parallelize(new[] { new Tuple&lt;int, int>(1, 2), new Tuple&lt;int, int>(3, 4) }, 1).Values().Collect()
[2, 4]
</summary>
@ -539,7 +539,7 @@
<param name="self"></param>
<returns></returns>
</member>
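
A consolidated sketch of the pair actions documented above after the Tuple migration, assuming an existing SparkContext `sc`:

```c#
var pairs = sc.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1);

var map = pairs.CollectAsMap();        // map[1] == 2, map[3] == 4
var keys = pairs.Keys().Collect();     // [1, 3]
var values = pairs.Values().Collect(); // [2, 4]
```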
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.ReduceByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,``1,``1},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.ReduceByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``1,``1,``1},System.Int32)">
<summary>
Merge the values for each key using an associative reduce function.
@ -551,9 +551,9 @@
sc.Parallelize(new[]
{
new KeyValuePair&lt;string, int>("a", 1),
new KeyValuePair&lt;string, int>("b", 1),
new KeyValuePair&lt;string, int>("a", 1)
new Tuple&lt;string, int>("a", 1),
new Tuple&lt;string, int>("b", 1),
new Tuple&lt;string, int>("a", 1)
}, 2)
.ReduceByKey((x, y) => x + y).Collect()
@ -567,7 +567,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.ReduceByKeyLocally``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,``1,``1})">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.ReduceByKeyLocally``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``1,``1,``1})">
<summary>
Merge the values for each key using an associative reduce function, but
return the results immediately to the master as a dictionary.
@ -577,9 +577,9 @@
sc.Parallelize(new[]
{
new KeyValuePair&lt;string, int>("a", 1),
new KeyValuePair&lt;string, int>("b", 1),
new KeyValuePair&lt;string, int>("a", 1)
new Tuple&lt;string, int>("a", 1),
new Tuple&lt;string, int>("b", 1),
new Tuple&lt;string, int>("a", 1)
}, 2)
.ReduceByKeyLocally((x, y) => x + y)
@ -592,15 +592,15 @@
<param name="reduceFunc"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.CountByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}})">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.CountByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}})">
<summary>
Count the number of elements for each key, and return the result to the master as a dictionary.
sc.Parallelize(new[]
{
new KeyValuePair&lt;string, int>("a", 1),
new KeyValuePair&lt;string, int>("b", 1),
new KeyValuePair&lt;string, int>("a", 1)
new Tuple&lt;string, int>("a", 1),
new Tuple&lt;string, int>("b", 1),
new Tuple&lt;string, int>("a", 1)
}, 2)
.CountByKey()
@ -612,7 +612,7 @@
<param name="self"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Join``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Join``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
<summary>
Return an RDD containing all pairs of elements with matching keys in this RDD and <paramref name="other"/>.
@ -621,9 +621,9 @@
Performs a hash join across the cluster.
var l = sc.Parallelize(
new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 1);
new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 1);
var r = sc.Parallelize(
new[] { new KeyValuePair&lt;string, int>("a", 2), new KeyValuePair&lt;string, int>("a", 3) }, 1);
new[] { new Tuple&lt;string, int>("a", 2), new Tuple&lt;string, int>("a", 3) }, 1);
var m = l.Join(r, 2).Collect();
[('a', (1, 2)), ('a', (1, 3))]
@ -637,7 +637,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
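
The hash join still returns an RDD of `Tuple<K, Tuple<V, W>>` after the migration; a hedged sketch of the doc sample above, assuming an existing SparkContext `sc` and `using System` for Console:

```c#
var l = sc.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 1);
var r = sc.Parallelize(new[] { new Tuple<string, int>("a", 2), new Tuple<string, int>("a", 3) }, 1);

foreach (var t in l.Join(r, 2).Collect())
{
    // t.Item1 is the key; t.Item2.Item1 / t.Item2.Item2 are the joined values
    Console.WriteLine("{0}: ({1}, {2})", t.Item1, t.Item2.Item1, t.Item2.Item2);   // a: (1, 2), a: (1, 3)
}
```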
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.LeftOuterJoin``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.LeftOuterJoin``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
<summary>
Perform a left outer join of this RDD and <paramref name="other"/>.
@ -648,9 +648,9 @@
Hash-partitions the resulting RDD into the given number of partitions.
var l = sc.Parallelize(
new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 1);
new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 1);
var r = sc.Parallelize(
new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
new[] { new Tuple&lt;string, int>("a", 2) }, 1);
var m = l.LeftOuterJoin(r).Collect();
[('a', (1, 2)), ('b', (4, Option))]
@ -664,7 +664,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.RightOuterJoin``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.RightOuterJoin``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
<summary>
Perform a right outer join of this RDD and <paramref name="other"/>.
@ -675,9 +675,9 @@
Hash-partitions the resulting RDD into the given number of partitions.
var l = sc.Parallelize(
new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
new[] { new Tuple&lt;string, int>("a", 2) }, 1);
var r = sc.Parallelize(
new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 1);
new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 1);
var m = l.RightOuterJoin(r).Collect();
[('a', (2, 1)), ('b', (Option, 4))]
@ -691,7 +691,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.FullOuterJoin``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.FullOuterJoin``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
<summary>
Perform a full outer join of this RDD and <paramref name="other"/>.
@ -706,9 +706,9 @@
Hash-partitions the resulting RDD into the given number of partitions.
var l = sc.Parallelize(
new[] { new KeyValuePair&lt;string, int>("a", 1), KeyValuePair&lt;string, int>("b", 4) }, 1);
new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 1);
var r = sc.Parallelize(
new[] { new KeyValuePair&lt;string, int>("a", 2), new KeyValuePair&lt;string, int>("c", 8) }, 1);
new[] { new Tuple&lt;string, int>("a", 2), new Tuple&lt;string, int>("c", 8) }, 1);
var m = l.FullOuterJoin(r).Collect();
[('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]
@ -722,18 +722,18 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.PartitionBy``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Int32,System.Func{System.Object,System.Int32})">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.PartitionBy``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Int32,System.Func{System.Object,System.Int32})">
<summary>
Return a copy of the RDD partitioned using the specified partitioner.
sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1).Map(x => new KeyValuePair&lt;int, int>(x, x)).PartitionBy(3).Glom().Collect()
sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1).Map(x => new Tuple&lt;int, int>(x, x)).PartitionBy(3).Glom().Collect()
</summary>
<param name="self"></param>
<param name="numPartitions"></param>
<param name="partitionFunc"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.CombineByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``2},System.Func{``2,``1,``2},System.Func{``2,``2,``2},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.CombineByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``2},System.Func{``2,``1,``2},System.Func{``2,``2,``2},System.Int32)">
<summary>
# TODO: add control over map-side aggregation
Generic function to combine the elements for each key using a custom
@ -755,9 +755,9 @@
sc.Parallelize(
new[]
{
new KeyValuePair&lt;string, int>("a", 1),
new KeyValuePair&lt;string, int>("b", 1),
new KeyValuePair&lt;string, int>("a", 1)
new Tuple&lt;string, int>("a", 1),
new Tuple&lt;string, int>("b", 1),
new Tuple&lt;string, int>("a", 1)
}, 2)
.CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
@ -773,7 +773,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.AggregateByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``2},System.Func{``2,``1,``2},System.Func{``2,``2,``2},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.AggregateByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``2},System.Func{``2,``1,``2},System.Func{``2,``2,``2},System.Int32)">
<summary>
Aggregate the values of each key, using given combine functions and a neutral
"zero value". This function can return a different result type, U, than the type
@ -786,9 +786,9 @@
sc.Parallelize(
new[]
{
new KeyValuePair&lt;string, int>("a", 1),
new KeyValuePair&lt;string, int>("b", 1),
new KeyValuePair&lt;string, int>("a", 1)
new Tuple&lt;string, int>("a", 1),
new Tuple&lt;string, int>("b", 1),
new Tuple&lt;string, int>("a", 1)
}, 2)
.AggregateByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
@ -804,7 +804,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.FoldByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1},System.Func{``1,``1,``1},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.FoldByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``1},System.Func{``1,``1,``1},System.Int32)">
<summary>
Merge the values for each key using an associative function "func"
and a neutral "zeroValue" which may be added to the result an
@ -814,9 +814,9 @@
sc.Parallelize(
new[]
{
new KeyValuePair&lt;string, int>("a", 1),
new KeyValuePair&lt;string, int>("b", 1),
new KeyValuePair&lt;string, int>("a", 1)
new Tuple&lt;string, int>("a", 1),
new Tuple&lt;string, int>("b", 1),
new Tuple&lt;string, int>("a", 1)
}, 2)
.FoldByKey(() => 0, (x, y) => x + y).Collect()
@ -830,7 +830,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Int32)">
<summary>
Group the values for each key in the RDD into a single sequence.
Hash-partitions the resulting RDD with numPartitions partitions.
@ -842,9 +842,9 @@
sc.Parallelize(
new[]
{
new KeyValuePair&lt;string, int>("a", 1),
new KeyValuePair&lt;string, int>("b", 1),
new KeyValuePair&lt;string, int>("a", 1)
new Tuple&lt;string, int>("a", 1),
new Tuple&lt;string, int>("b", 1),
new Tuple&lt;string, int>("a", 1)
}, 2)
.GroupByKey().MapValues(l => string.Join(" ", l)).Collect()
@ -857,7 +857,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.MapValues``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,``2})">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.MapValues``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``1,``2})">
<summary>
Pass each value in the key-value pair RDD through a map function
without changing the keys; this also retains the original RDD's partitioning.
@ -865,8 +865,8 @@
sc.Parallelize(
new[]
{
new KeyValuePair&lt;string, string[]>("a", new[]{"apple", "banana", "lemon"}),
new KeyValuePair&lt;string, string[]>("b", new[]{"grapes"})
new Tuple&lt;string, string[]>("a", new[]{"apple", "banana", "lemon"}),
new Tuple&lt;string, string[]>("b", new[]{"grapes"})
}, 2)
.MapValues(x => x.Length).Collect()
@ -880,7 +880,7 @@
<param name="func"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.FlatMapValues``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,System.Collections.Generic.IEnumerable{``2}})">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.FlatMapValues``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``1,System.Collections.Generic.IEnumerable{``2}})">
<summary>
Pass each value in the key-value pair RDD through a flatMap function
without changing the keys; this also retains the original RDD's partitioning.
@ -888,8 +888,8 @@
x = sc.Parallelize(
new[]
{
new KeyValuePair&lt;string, string[]>("a", new[]{"x", "y", "z"}),
new KeyValuePair&lt;string, string[]>("b", new[]{"p", "r"})
new Tuple&lt;string, string[]>("a", new[]{"x", "y", "z"}),
new Tuple&lt;string, string[]>("b", new[]{"p", "r"})
}, 2)
.FlatMapValues(x => x).Collect()
@ -903,9 +903,9 @@
<param name="func"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.MapPartitionsWithIndex``5(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,System.Object}})">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.MapPartitionsWithIndex``5(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,System.Object}})">
<summary>
explicitly convert KeyValuePair&lt;K, V> to KeyValuePair&lt;K, dynamic>
explicitly convert Tuple&lt;K, V> to Tuple&lt;K, dynamic>
since they are incompatible types, unlike V to dynamic
</summary>
<typeparam name="K"></typeparam>
@ -916,13 +916,13 @@
<param name="self"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupWith``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupWith``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
<summary>
For each key k in this RDD or <paramref name="other"/>, return a resulting RDD that
contains a tuple with the list of values for that key in this RDD as well as <paramref name="other"/>.
var x = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 2);
var y = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
var x = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 2);
var y = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 2) }, 1);
x.GroupWith(y).Collect();
[('a', ([1], [2])), ('b', ([4], []))]
@ -936,11 +936,11 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupWith``4(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``3}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupWith``4(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``3}},System.Int32)">
<summary>
var x = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 5), new KeyValuePair&lt;string, int>("b", 6) }, 2);
var y = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 2);
var z = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
var x = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 5), new Tuple&lt;string, int>("b", 6) }, 2);
var y = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 2);
var z = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 2) }, 1);
x.GroupWith(y, z).Collect();
</summary>
<typeparam name="K"></typeparam>
@ -953,12 +953,12 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupWith``5(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``3}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``4}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupWith``5(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``3}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``4}},System.Int32)">
<summary>
var x = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 5), new KeyValuePair&lt;string, int>("b", 6) }, 2);
var y = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 2);
var z = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
var w = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("b", 42) }, 1);
var x = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 5), new Tuple&lt;string, int>("b", 6) }, 2);
var y = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 1), new Tuple&lt;string, int>("b", 4) }, 2);
var z = sc.Parallelize(new[] { new Tuple&lt;string, int>("a", 2) }, 1);
var w = sc.Parallelize(new[] { new Tuple&lt;string, int>("b", 42) }, 1);
var m = x.GroupWith(y, z, w).MapValues(l => string.Join(" ", l.Item1) + " : " + string.Join(" ", l.Item2) + " : " + string.Join(" ", l.Item3) + " : " + string.Join(" ", l.Item4)).Collect();
</summary>
<typeparam name="K"></typeparam>
@ -973,12 +973,12 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SubtractByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SubtractByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
<summary>
Return each (key, value) pair in this RDD that has no pair with matching key in <paramref name="other"/>.
var x = sc.Parallelize(new[] { new KeyValuePair&lt;string, int?>("a", 1), new KeyValuePair&lt;string, int?>("b", 4), new KeyValuePair&lt;string, int?>("b", 5), new KeyValuePair&lt;string, int?>("a", 2) }, 2);
var y = sc.Parallelize(new[] { new KeyValuePair&lt;string, int?>("a", 3), new KeyValuePair&lt;string, int?>("c", null) }, 2);
var x = sc.Parallelize(new[] { new Tuple&lt;string, int?>("a", 1), new Tuple&lt;string, int?>("b", 4), new Tuple&lt;string, int?>("b", 5), new Tuple&lt;string, int?>("a", 2) }, 2);
var y = sc.Parallelize(new[] { new Tuple&lt;string, int?>("a", 3), new Tuple&lt;string, int?>("c", null) }, 2);
x.SubtractByKey(y).Collect();
[('b', 4), ('b', 5)]
@ -992,14 +992,14 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Lookup``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},``0)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Lookup``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},``0)">
<summary>
Return the list of values in the RDD for key `key`. This operation
is done efficiently if the RDD has a known partitioner by only
searching the partition that the key maps to.
>>> l = range(1000)
>>> rdd = sc.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new KeyValuePair&lt;int, int>(x, y)), 10)
>>> rdd = sc.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new Tuple&lt;int, int>(x, y)), 10)
>>> rdd.lookup(42)
[42]
@ -1010,7 +1010,7 @@
<param name="key"></param>
<returns></returns>
</member>
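
A hedged sketch of `Lookup` on a Tuple RDD, following the doc sample above; `sc` is an existing SparkContext and `System.Linq` is assumed for `Enumerable`:

```c#
var rdd = sc.Parallelize(
    Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new Tuple<int, int>(x, y)), 10);

// Returns the values stored under key 42; efficient when the RDD has a known partitioner
var hits = rdd.Lookup(42);   // [42]
```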
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsNewAPIHadoopDataset``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsNewAPIHadoopDataset``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
<summary>
Output an RDD of key-value pairs (of form RDD[(K, V)]) to any Hadoop file
system, using the new Hadoop OutputFormat API (mapreduce package). Keys/values are
@ -1022,7 +1022,7 @@
<param name="self"></param>
<param name="conf">Hadoop job configuration, passed in as a dict</param>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsNewAPIHadoopFile``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsNewAPIHadoopFile``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
<summary>
</summary>
@ -1035,7 +1035,7 @@
<param name="valueClass">fully qualified classname of value Writable class (e.g. "org.apache.hadoop.io.Text", None by default)</param>
<param name="conf">Hadoop job configuration, passed in as a dict (None by default)</param>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsHadoopDataset``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsHadoopDataset``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
<summary>
Output an RDD of key-value pairs (of form RDD[(K, V)]) to any Hadoop file
system, using the old Hadoop OutputFormat API (mapred package). Keys/values are
@ -1047,7 +1047,7 @@
<param name="self"></param>
<param name="conf">Hadoop job configuration, passed in as a dict</param>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsHadoopFile``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}},System.String)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsHadoopFile``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}},System.String)">
<summary>
Output an RDD of key-value pairs (of form RDD[(K, V)]) to any Hadoop file
system, using the old Hadoop OutputFormat API (mapred package). Key and value types
@ -1066,7 +1066,7 @@
<param name="conf">(None by default)</param>
<param name="compressionCodecClass">(None by default)</param>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsSequenceFile``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.String,System.String)">
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsSequenceFile``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.String,System.String)">
<summary>
Output an RDD of key-value pairs (of form RDD[(K, V)]) to any Hadoop file
system, using the org.apache.hadoop.io.Writable types that we convert from the
@ -1169,6 +1169,11 @@
Indicates whether the RDD is checkpointed.
</summary>
</member>
<member name="P:Microsoft.Spark.CSharp.Core.RDD`1.SparkContext">
<summary>
Return the SparkContext that created this RDD
</summary>
</member>
<member name="P:Microsoft.Spark.CSharp.Core.RDD`1.IsCached">
<summary>
Return whether this RDD has been cached or not
@ -1231,7 +1236,7 @@
<summary>
Return a new RDD by applying a function to each element of this RDD.
sc.Parallelize(new string[]{"b", "a", "c"}, 1).Map(x => new KeyValuePair&lt;string, int>(x, 1)).Collect()
sc.Parallelize(new string[]{"b", "a", "c"}, 1).Map(x => new Tuple&lt;string, int>(x, 1)).Collect()
[('a', 1), ('b', 1), ('c', 1)]
</summary>
@ -2132,7 +2137,7 @@
Do
{{{
RDD&lt;KeyValuePair&lt;string, string>> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path")
RDD&lt;Tuple&lt;string, string>> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path")
}}}
then `rdd` contains
@ -2167,7 +2172,7 @@
}}}
Do
RDD&lt;KeyValuePair&lt;string, byte[]>>"/> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
RDD&lt;Tuple&lt;string, byte[]>> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
then `rdd` contains
{{{
@ -2206,7 +2211,7 @@
<param name="minSplits">minimum splits in dataset (default min(2, sc.defaultParallelism))</param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.NewAPIHadoopFile(System.String,System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.NewAPIHadoopFile(System.String,System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
<summary>
Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,
a local file system (available on all nodes), or any Hadoop-supported file system URI.
@ -2224,7 +2229,7 @@
<param name="conf"> Hadoop configuration, passed in as a dict (None by default)</param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.NewAPIHadoopRDD(System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.NewAPIHadoopRDD(System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
<summary>
Read a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
Hadoop configuration, which is passed in as a Python dict.
@ -2240,7 +2245,7 @@
<param name="conf">Hadoop configuration, passed in as a dict (None by default)</param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.HadoopFile(System.String,System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.HadoopFile(System.String,System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
<summary>
Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS,
a local file system (available on all nodes), or any Hadoop-supported file system URI.
@ -2258,7 +2263,7 @@
<param name="conf">Hadoop configuration, passed in as a dict (None by default)</param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.HadoopRDD(System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.HadoopRDD(System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
<summary>
Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
Hadoop configuration, which is passed in as a Python dict.
@ -2391,6 +2396,14 @@
</summary>
<param name="logLevel"></param>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.RunJob``1(Microsoft.Spark.CSharp.Core.RDD{``0},System.Collections.Generic.IEnumerable{System.Int32})">
<summary>
Run a job on a given set of partitions of an RDD.
</summary>
<typeparam name="T"></typeparam>
<param name="rdd"></param>
<param name="partitions"></param>
</member>
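
`RunJob` evaluates only the requested partitions instead of the whole RDD; a hedged sketch, with `sc` an existing SparkContext and the partition indices chosen arbitrarily:

```c#
var rdd = sc.Parallelize(Enumerable.Range(0, 100), 4).Map(x => x * x);

// Trigger computation of partitions 0 and 2 only; the remaining partitions are left untouched
sc.RunJob(rdd, new[] { 0, 2 });
```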
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.CancelJobGroup(System.String)">
<summary>
Cancel active jobs for the specified group. See <see cref="M:Microsoft.Spark.CSharp.Core.SparkContext.SetJobGroup(System.String,System.String,System.Boolean)"/> for more information.
@ -7662,6 +7675,171 @@
<param name="json">The Json object used to construct a StructType</param>
<returns>A new StructType instance</returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``1(System.String,System.Func{``0})">
<summary>
Register UDF with no input argument, e.g:
SqlContext.RegisterFunction&lt;bool>("MyFilter", () => true);
sqlContext.Sql("SELECT * FROM MyTable where MyFilter()");
</summary>
<typeparam name="RT"></typeparam>
<param name="name"></param>
<param name="f"></param>
</member>
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``2(System.String,System.Func{``1,``0})">
<summary>
Register UDF with 1 input argument, e.g:
SqlContext.RegisterFunction&lt;bool, string>("MyFilter", (arg1) => arg1 != null);
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1)");
</summary>
<typeparam name="RT"></typeparam>
<typeparam name="A1"></typeparam>
<param name="name"></param>
<param name="f"></param>
</member>
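
The registration pattern from these summaries, end to end; `sqlContext` is an existing SqlContext, and the table and column names are hypothetical:

```c#
// Register a one-argument UDF and call it from SQL
sqlContext.RegisterFunction<bool, string>("MyFilter", arg1 => arg1 != null);
var filtered = sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1)");
filtered.Show();   // Show() on the resulting DataFrame is assumed, as in the Mobius samples
```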
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``3(System.String,System.Func{``1,``2,``0})">
<summary>
Register UDF with 2 input arguments, e.g:
SqlContext.RegisterFunction&lt;bool, string, string>("MyFilter", (arg1, arg2) => arg1 != null &amp;&amp; arg2 != null);
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2)");
</summary>
<typeparam name="RT"></typeparam>
<typeparam name="A1"></typeparam>
<typeparam name="A2"></typeparam>
<param name="name"></param>
<param name="f"></param>
</member>
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``4(System.String,System.Func{``1,``2,``3,``0})">
<summary>
Register UDF with 3 input arguments, e.g:
SqlContext.RegisterFunction&lt;bool, string, string, string>("MyFilter", (arg1, arg2, arg3) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; arg3 != null);
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, columnName3)");
</summary>
<typeparam name="RT"></typeparam>
<typeparam name="A1"></typeparam>
<typeparam name="A2"></typeparam>
<typeparam name="A3"></typeparam>
<param name="name"></param>
<param name="f"></param>
</member>
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``5(System.String,System.Func{``1,``2,``3,``4,``0})">
<summary>
Register UDF with 4 input arguments, e.g:
SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg4) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg3 != null);
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName4)");
</summary>
<typeparam name="RT"></typeparam>
<typeparam name="A1"></typeparam>
<typeparam name="A2"></typeparam>
<typeparam name="A3"></typeparam>
<typeparam name="A4"></typeparam>
<param name="name"></param>
<param name="f"></param>
</member>
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``6(System.String,System.Func{``1,``2,``3,``4,``5,``0})">
<summary>
Register UDF with 5 input arguments, e.g:
SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg5) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg5 != null);
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName5)");
</summary>
<typeparam name="RT"></typeparam>
<typeparam name="A1"></typeparam>
<typeparam name="A2"></typeparam>
<typeparam name="A3"></typeparam>
<typeparam name="A4"></typeparam>
<typeparam name="A5"></typeparam>
<param name="name"></param>
<param name="f"></param>
</member>
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``7(System.String,System.Func{``1,``2,``3,``4,``5,``6,``0})">
<summary>
Register UDF with 6 input arguments, e.g:
SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg6) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg6 != null);
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName6)");
</summary>
<typeparam name="RT"></typeparam>
<typeparam name="A1"></typeparam>
<typeparam name="A2"></typeparam>
<typeparam name="A3"></typeparam>
<typeparam name="A4"></typeparam>
<typeparam name="A5"></typeparam>
<typeparam name="A6"></typeparam>
<param name="name"></param>
<param name="f"></param>
</member>
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``8(System.String,System.Func{``1,``2,``3,``4,``5,``6,``7,``0})">
<summary>
Register UDF with 7 input arguments, e.g:
SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg7) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg7 != null);
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName7)");
</summary>
<typeparam name="RT"></typeparam>
<typeparam name="A1"></typeparam>
<typeparam name="A2"></typeparam>
<typeparam name="A3"></typeparam>
<typeparam name="A4"></typeparam>
<typeparam name="A5"></typeparam>
<typeparam name="A6"></typeparam>
<typeparam name="A7"></typeparam>
<param name="name"></param>
<param name="f"></param>
</member>
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``9(System.String,System.Func{``1,``2,``3,``4,``5,``6,``7,``8,``0})">
<summary>
Register UDF with 8 input arguments, e.g:
SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg8) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg8 != null);
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName8)");
</summary>
<typeparam name="RT"></typeparam>
<typeparam name="A1"></typeparam>
<typeparam name="A2"></typeparam>
<typeparam name="A3"></typeparam>
<typeparam name="A4"></typeparam>
<typeparam name="A5"></typeparam>
<typeparam name="A6"></typeparam>
<typeparam name="A7"></typeparam>
<typeparam name="A8"></typeparam>
<param name="name"></param>
<param name="f"></param>
</member>
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``10(System.String,System.Func{``1,``2,``3,``4,``5,``6,``7,``8,``9,``0})">
<summary>
Register UDF with 9 input arguments, e.g:
SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg9) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg9 != null);
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName9)");
</summary>
<typeparam name="RT"></typeparam>
<typeparam name="A1"></typeparam>
<typeparam name="A2"></typeparam>
<typeparam name="A3"></typeparam>
<typeparam name="A4"></typeparam>
<typeparam name="A5"></typeparam>
<typeparam name="A6"></typeparam>
<typeparam name="A7"></typeparam>
<typeparam name="A8"></typeparam>
<typeparam name="A9"></typeparam>
<param name="name"></param>
<param name="f"></param>
</member>
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``11(System.String,System.Func{``1,``2,``3,``4,``5,``6,``7,``8,``9,``10,``0})">
<summary>
Register UDF with 10 input arguments, e.g:
SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg10) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg10 != null);
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName10)");
</summary>
<typeparam name="RT"></typeparam>
<typeparam name="A1"></typeparam>
<typeparam name="A2"></typeparam>
<typeparam name="A3"></typeparam>
<typeparam name="A4"></typeparam>
<typeparam name="A5"></typeparam>
<typeparam name="A6"></typeparam>
<typeparam name="A7"></typeparam>
<typeparam name="A8"></typeparam>
<typeparam name="A9"></typeparam>
<typeparam name="A10"></typeparam>
<param name="name"></param>
<param name="f"></param>
</member>
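            Pulling the registration entries above together, a minimal end-to-end sketch; the table `MyTable`, the column `name`, and the `sqlContext` instance are placeholders, not part of the documented API:
```c#
// Register a 1-argument UDF and call it from Spark SQL.
sqlContext.RegisterFunction<bool, string>("NotEmpty", s => !string.IsNullOrEmpty(s));
var nonEmptyNames = sqlContext.Sql("SELECT name FROM MyTable WHERE NotEmpty(name)");
Console.WriteLine(nonEmptyNames.Count());   // number of rows that pass the UDF filter
```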
<member name="T:Microsoft.Spark.CSharp.Streaming.ConstantInputDStream`1">
<summary>
An input stream that always returns the same RDD on each timestep. Useful for testing.
@ -7968,7 +8146,7 @@
Utility for creating streams from
</summary>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.EventHubsUtils.CreateUnionStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.Dictionary{System.String,System.String},Microsoft.Spark.CSharp.Core.StorageLevelType)">
<member name="M:Microsoft.Spark.CSharp.Streaming.EventHubsUtils.CreateUnionStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}},Microsoft.Spark.CSharp.Core.StorageLevelType)">
<summary>
Create a unioned EventHubs stream that receives data from Microsoft Azure Eventhubs
The unioned stream will receive message from all partitions of the EventHubs
@ -7998,7 +8176,7 @@
Utils for Kafka input stream.
</summary>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.String,System.String,System.Collections.Generic.Dictionary{System.String,System.Int32},System.Collections.Generic.Dictionary{System.String,System.String})">
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.Int32}},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
<summary>
Create an input stream that pulls messages from a Kafka Broker.
</summary>
@ -8009,7 +8187,7 @@
<param name="kafkaParams">Additional params for Kafka</param>
<returns>A DStream object</returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.String,System.String,System.Collections.Generic.Dictionary{System.String,System.Int32},System.Collections.Generic.Dictionary{System.String,System.String},Microsoft.Spark.CSharp.Core.StorageLevelType)">
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.Int32}},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}},Microsoft.Spark.CSharp.Core.StorageLevelType)">
<summary>
Create an input stream that pulls messages from a Kafka Broker.
</summary>
@ -8021,7 +8199,7 @@
<param name="storageLevelType">RDD storage level.</param>
<returns>A DStream object</returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateDirectStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.Dictionary{System.String,System.String},System.Collections.Generic.Dictionary{System.String,System.Int64})">
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateDirectStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.Int64}})">
<summary>
Create an input stream that directly pulls messages from a Kafka Broker and specific offset.
@ -8047,7 +8225,7 @@
<param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
<returns>A DStream object</returns>
</member>
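            A sketch of the Tuple-based direct stream creation documented above, assuming an existing StreamingContext `ssc`; the broker address, topic name, and Kafka settings are placeholders:
```c#
var topics = new List<string> { "test-topic" };
var kafkaParams = new List<Tuple<string, string>>
{
    Tuple.Create("metadata.broker.list", "localhost:9092")
};
var fromOffsets = new List<Tuple<string, long>>();   // empty: start from the default offsets
var kafkaStream = KafkaUtils.CreateDirectStream(ssc, topics, kafkaParams, fromOffsets);
```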
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateDirectStream``1(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.Dictionary{System.String,System.String},System.Collections.Generic.Dictionary{System.String,System.Int64},System.Func{System.Int32,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.Byte[],System.Byte[]}},System.Collections.Generic.IEnumerable{``0}})">
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateDirectStream``1(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.Int64}},System.Func{System.Int32,System.Collections.Generic.IEnumerable{System.Tuple{System.Byte[],System.Byte[]}},System.Collections.Generic.IEnumerable{``0}})">
<summary>
Create an input stream that directly pulls messages from a Kafka Broker and specific offset.
@ -8074,14 +8252,14 @@
<param name="readFunc">user function to process the kafka data.</param>
<returns>A DStream object</returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.GetOffsetRange(System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.Byte[],System.Byte[]}})">
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.GetOffsetRange(System.Collections.Generic.IEnumerable{System.Tuple{System.Byte[],System.Byte[]}})">
<summary>
create offset range from kafka messages when CSharpReader is enabled
</summary>
<param name="input"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.GetNumPartitionsFromConfig(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.Dictionary{System.String,System.String})">
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.GetNumPartitionsFromConfig(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
<summary>
            topics should contain only one topic when choosing to repartition to a configured numPartitions
TODO: move to scala and merge into DynamicPartitionKafkaRDD.getPartitions to remove above limitation
@ -8202,7 +8380,7 @@
<param name="idleDuration">The idle time of duration</param>
<returns>The new StateSpec object</returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.StateSpec`4.InitialState(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{`0,`2}})">
<member name="M:Microsoft.Spark.CSharp.Streaming.StateSpec`4.InitialState(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{`0,`2}})">
<summary>
Set the RDD containing the initial states that will be used by mapWithState
</summary>
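            A sketch of seeding MapWithState through the member above, mirroring the usage in the adapter tests; `ssc` and the keyed DStream `pairs` (a DStream&lt;Tuple&lt;string, int>> of (word, 1) records) are assumed to exist:
```c#
var initialState = ssc.SparkContext.Parallelize(new[] { new Tuple<string, int>("The", 0) });
var stateSpec = new StateSpec<string, int, int, int>((key, value, state) => value)
    .InitialState(initialState)   // seed state picked up on the first batch
    .NumPartitions(2);
var mapped = pairs.MapWithState(stateSpec);
```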
@ -8249,10 +8427,10 @@
</member>
<member name="T:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions">
<summary>
operations only available to KeyValuePair RDD
operations only available to Tuple RDD
</summary>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.ReduceByKey``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,``1,``1},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.ReduceByKey``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{``1,``1,``1},System.Int32)">
<summary>
Return a new DStream by applying ReduceByKey to each RDD.
</summary>
@ -8263,7 +8441,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.CombineByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``2},System.Func{``2,``1,``2},System.Func{``2,``2,``2},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.CombineByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{``2},System.Func{``2,``1,``2},System.Func{``2,``2,``2},System.Int32)">
<summary>
Return a new DStream by applying combineByKey to each RDD.
</summary>
@ -8277,7 +8455,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
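            A per-key average sketch built on the CombineByKey entry above; `pairs` is an assumed DStream&lt;Tuple&lt;string, int>>, and the trailing numPartitions argument is assumed to be optional as in the RDD API:
```c#
var averages = pairs
    .CombineByKey<string, int, Tuple<int, int>>(
        () => Tuple.Create(0, 0),                                        // createCombiner: (sum, count)
        (acc, v) => Tuple.Create(acc.Item1 + v, acc.Item2 + 1),         // mergeValue
        (a, b) => Tuple.Create(a.Item1 + b.Item1, a.Item2 + b.Item2))   // mergeCombiners
    .MapValues(acc => (double)acc.Item1 / acc.Item2);
```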
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.PartitionBy``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.PartitionBy``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Int32)">
<summary>
Return a new DStream in which each RDD are partitioned by numPartitions.
</summary>
@ -8287,7 +8465,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.MapValues``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,``2})">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.MapValues``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{``1,``2})">
<summary>
Return a new DStream by applying a map function to the value of
each key-value pairs in this DStream without changing the key.
@ -8299,7 +8477,7 @@
<param name="func"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.FlatMapValues``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,System.Collections.Generic.IEnumerable{``2}})">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.FlatMapValues``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{``1,System.Collections.Generic.IEnumerable{``2}})">
<summary>
Return a new DStream by applying a flatmap function to the value
of each key-value pairs in this DStream without changing the key.
@ -8311,7 +8489,7 @@
<param name="func"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.GroupByKey``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.GroupByKey``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Int32)">
<summary>
Return a new DStream by applying groupByKey on each RDD.
</summary>
@ -8321,7 +8499,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.GroupWith``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.GroupWith``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``2}},System.Int32)">
<summary>
Return a new DStream by applying 'cogroup' between RDDs of this DStream and `other` DStream.
Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
@ -8334,7 +8512,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.Join``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.Join``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``2}},System.Int32)">
<summary>
Return a new DStream by applying 'join' between RDDs of this DStream and `other` DStream.
Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
@ -8347,7 +8525,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
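            A quick sketch of the Join entry above; `clicks` and `impressions` are assumed DStream&lt;Tuple&lt;string, int>> instances:
```c#
// Inner-join two keyed DStreams into 4 partitions; each record becomes
// Tuple<string, Tuple<int, int>>: the key, then (left value, right value).
var joined = clicks.Join(impressions, 4);
```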
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.LeftOuterJoin``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.LeftOuterJoin``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``2}},System.Int32)">
<summary>
Return a new DStream by applying 'left outer join' between RDDs of this DStream and `other` DStream.
Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
@ -8360,7 +8538,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.RightOuterJoin``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.RightOuterJoin``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``2}},System.Int32)">
<summary>
Return a new DStream by applying 'right outer join' between RDDs of this DStream and `other` DStream.
Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
@ -8373,7 +8551,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.FullOuterJoin``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.FullOuterJoin``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``2}},System.Int32)">
<summary>
Return a new DStream by applying 'full outer join' between RDDs of this DStream and `other` DStream.
Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
@ -8386,7 +8564,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.GroupByKeyAndWindow``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Int32,System.Int32,System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.GroupByKeyAndWindow``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Int32,System.Int32,System.Int32)">
<summary>
Return a new DStream by applying `GroupByKey` over a sliding window.
Similar to `DStream.GroupByKey()`, but applies it over a sliding window.
@ -8403,7 +8581,7 @@
<param name="numPartitions">Number of partitions of each RDD in the new DStream.</param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.ReduceByKeyAndWindow``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,``1,``1},System.Func{``1,``1,``1},System.Int32,System.Int32,System.Int32,System.Func{System.Collections.Generic.KeyValuePair{``0,``1},System.Boolean})">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.ReduceByKeyAndWindow``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{``1,``1,``1},System.Func{``1,``1,``1},System.Int32,System.Int32,System.Int32,System.Func{System.Tuple{``0,``1},System.Boolean})">
<summary>
Return a new DStream by applying incremental `reduceByKey` over a sliding window.
@ -8424,7 +8602,7 @@
<param name="filterFunc">function to filter expired key-value pairs; only pairs that satisfy the function are retained set this to null if you do not want to filter</param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.UpdateStateByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{System.Collections.Generic.IEnumerable{``1},``2,``2},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.UpdateStateByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{System.Collections.Generic.IEnumerable{``1},``2,``2},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
<summary>
Return a new "state" DStream where the state for each key is updated by applying
the given function on the previous state of the key and the new values of the key.
@ -8441,7 +8619,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
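            A running word-count sketch for the UpdateStateByKey overload above; `pairs` is an assumed DStream&lt;Tuple&lt;string, int>> of (word, 1) records, `System.Linq` is in scope, and the initial-state and partition arguments are assumed to have defaults:
```c#
// state: total count seen so far for the key; newValues: values observed in the current batch
var runningCounts = pairs.UpdateStateByKey<string, int, int>(
    (newValues, state) => state + newValues.Count());
```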
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.UpdateStateByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{``0,System.Tuple{System.Collections.Generic.IEnumerable{``1},``2}}},System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{``0,``2}}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.UpdateStateByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{System.Collections.Generic.IEnumerable{System.Tuple{``0,System.Tuple{System.Collections.Generic.IEnumerable{``1},``2}}},System.Collections.Generic.IEnumerable{System.Tuple{``0,``2}}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
<summary>
Return a new "state" DStream where the state for each key is updated by applying
the given function on the previous state of the key and the new values of the key.
@ -8455,7 +8633,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.UpdateStateByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{System.Int32,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{``0,System.Tuple{System.Collections.Generic.IEnumerable{``1},``2}}},System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{``0,``2}}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.UpdateStateByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{System.Int32,System.Collections.Generic.IEnumerable{System.Tuple{``0,System.Tuple{System.Collections.Generic.IEnumerable{``1},``2}}},System.Collections.Generic.IEnumerable{System.Tuple{``0,``2}}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
<summary>
Return a new "state" DStream where the state for each key is updated by applying
the given function on the previous state of the key and the new values of the key.
@ -8469,7 +8647,7 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.MapWithState``4(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Streaming.StateSpec{``0,``1,``2,``3})">
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.MapWithState``4(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Streaming.StateSpec{``0,``1,``2,``3})">
<summary>
Return a new "state" DStream where the state for each key is updated by applying
the given function on the previous state of the key and the new values of the key.

File differences are hidden because one or more lines are too long

View file

@ -1,4 +1,5 @@
using System.IO;
using System;
using System.IO;
using System.Collections.Generic;
using System.Net;
using System.Runtime.Serialization.Formatters.Binary;
@ -76,7 +77,7 @@ namespace AdapterTest
// write update
int key = 0;
int value = 100;
KeyValuePair<int, dynamic> update = new KeyValuePair<int, dynamic>(key, value);
Tuple<int, dynamic> update = new Tuple<int, dynamic>(key, value);
var ms = new MemoryStream();
var formatter = new BinaryFormatter();
formatter.Serialize(ms, update);
@ -107,7 +108,7 @@ namespace AdapterTest
// write update
int key = 1;
int value = 1000;
KeyValuePair<int, dynamic> update = new KeyValuePair<int, dynamic>(key, value);
Tuple<int, dynamic> update = new Tuple<int, dynamic>(key, value);
var ms = new MemoryStream();
var formatter = new BinaryFormatter();
formatter.Serialize(ms, update);
@ -119,8 +120,8 @@ namespace AdapterTest
byte[] receiveBuffer = new byte[1];
s.Read(receiveBuffer, 0, 1);
Assert.IsTrue(Accumulator.accumulatorRegistry.ContainsKey(update.Key));
var accumulator = Accumulator.accumulatorRegistry[update.Key] as Accumulator<int>;
Assert.IsTrue(Accumulator.accumulatorRegistry.ContainsKey(update.Item1));
var accumulator = Accumulator.accumulatorRegistry[update.Item1] as Accumulator<int>;
Assert.AreEqual(accumulator.Value, value);
}
}
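For context, a hypothetical application-side sketch of the accumulator round-trip exercised by this test, assuming a SparkContext `sc` and a PySpark-style `Add` method (an assumption; only `Value` appears in the test above):
```c#
var acc = sc.Accumulator(0);
sc.Parallelize(Enumerable.Range(1, 100), 4)
  .Foreach(x => acc.Add(1));          // workers ship updates back over the socket tested above
Console.WriteLine(acc.Value);         // expected: 100 once the job completes
```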

View file

@ -119,6 +119,7 @@
<Compile Include="PairRDDTest.cs" />
<Compile Include="ComparableRDDTest.cs" />
<Compile Include="DoubleRDDTest.cs" />
<Compile Include="UdfRegistrationTest.cs" />
<Compile Include="UserDefinedFunctionTest.cs" />
<Compile Include="WeakObjectManagerTest.cs" />
</ItemGroup>

View file

@ -1,4 +1,6 @@
using System;
using System.Linq;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Sql;
using NUnit.Framework;
@ -46,5 +48,13 @@ namespace AdapterTest
builder.Config("doublevalue", 3.5D);
Assert.True(builder.options["doublevalue"].Equals("3.5", StringComparison.InvariantCultureIgnoreCase));
}
[Test]
public void TestEnableHiveSupport()
{
var builder = new Builder();
builder.EnableHiveSupport();
Assert.True(builder.options["spark.sql.catalogImplementation"].Equals("hive", StringComparison.InvariantCultureIgnoreCase));
}
}
}
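For context, a hypothetical application-side sketch of the builder call exercised by the test above, assuming the SparkSession entry point mirrors Spark 2.0's builder API (`Builder()`, `AppName`, and `GetOrCreate` are assumptions here):
```c#
var spark = SparkSession
    .Builder()
    .AppName("MobiusHiveExample")   // assumed helper mirroring Spark's builder
    .EnableHiveSupport()            // sets spark.sql.catalogImplementation=hive, per the test
    .GetOrCreate();
```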

View file

@ -48,8 +48,8 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, long> countByWord = (KeyValuePair<string, long>)record;
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
Tuple<string, long> countByWord = (Tuple<string, long>)record;
Assert.AreEqual(countByWord.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22);
}
});
@ -91,7 +91,7 @@ namespace AdapterTest
var words = lines.FlatMap(l => l.Split(' '));
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
var pairs = words.Map(w => new Tuple<string, int>(w, 1));
var wordCounts = pairs.PartitionBy().ReduceByKey((x, y) => x + y);
@ -102,8 +102,8 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
Tuple<string, int> countByWord = (Tuple<string, int>)record;
Assert.AreEqual(countByWord.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22);
}
});
@ -116,8 +116,8 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, List<int>> countByWord = (KeyValuePair<string, List<int>>)record;
Assert.AreEqual(countByWord.Value.Count, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
Tuple<string, List<int>> countByWord = (Tuple<string, List<int>>)record;
Assert.AreEqual(countByWord.Item2.Count, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22);
}
});
@ -130,8 +130,8 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 46 : 44);
Tuple<string, int> countByWord = (Tuple<string, int>)record;
Assert.AreEqual(countByWord.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 46 : 44);
}
});
}
@ -147,12 +147,12 @@ namespace AdapterTest
var words = lines.FlatMap(l => l.Split(' '));
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
var pairs = words.Map(w => new Tuple<string, int>(w, 1));
var wordCounts = pairs.ReduceByKey((x, y) => x + y);
var left = wordCounts.Filter(x => x.Key != "quick" && x.Key != "lazy");
var right = wordCounts.Filter(x => x.Key != "brown");
var left = wordCounts.Filter(x => x.Item1 != "quick" && x.Item1 != "lazy");
var right = wordCounts.Filter(x => x.Item1 != "brown");
var groupWith = left.GroupWith(right);
groupWith.ForeachRDD((time, rdd) =>
@ -162,15 +162,15 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, Tuple<List<int>, List<int>>> countByWord = (KeyValuePair<string, Tuple<List<int>, List<int>>>)record;
if (countByWord.Key == "quick" || countByWord.Key == "lazy")
Assert.AreEqual(countByWord.Value.Item1.Count, 0);
else if (countByWord.Key == "brown")
Assert.AreEqual(countByWord.Value.Item2.Count, 0);
Tuple<string, Tuple<List<int>, List<int>>> countByWord = (Tuple<string, Tuple<List<int>, List<int>>>)record;
if (countByWord.Item1 == "quick" || countByWord.Item1 == "lazy")
Assert.AreEqual(countByWord.Item2.Item1.Count, 0);
else if (countByWord.Item1 == "brown")
Assert.AreEqual(countByWord.Item2.Item2.Count, 0);
else
{
Assert.AreEqual(countByWord.Value.Item1[0], countByWord.Key == "The" || countByWord.Key == "dog" ? 23 : 22);
Assert.AreEqual(countByWord.Value.Item2[0], countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
Assert.AreEqual(countByWord.Item2.Item1[0], countByWord.Item1 == "The" || countByWord.Item1 == "dog" ? 23 : 22);
Assert.AreEqual(countByWord.Item2.Item2[0], countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22);
}
}
});
@ -183,9 +183,9 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, Tuple<int, int>> countByWord = (KeyValuePair<string, Tuple<int, int>>)record;
Assert.AreEqual(countByWord.Value.Item1, countByWord.Key == "The" || countByWord.Key == "dog" ? 23 : 22);
Assert.AreEqual(countByWord.Value.Item2, countByWord.Key == "The" || countByWord.Key == "dog" ? 23 : 22);
Tuple<string, Tuple<int, int>> countByWord = (Tuple<string, Tuple<int, int>>)record;
Assert.AreEqual(countByWord.Item2.Item1, countByWord.Item1 == "The" || countByWord.Item1 == "dog" ? 23 : 22);
Assert.AreEqual(countByWord.Item2.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" ? 23 : 22);
}
});
@ -197,11 +197,11 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, Tuple<int, Option<int>>> countByWord = (KeyValuePair<string, Tuple<int, Option<int>>>)record;
Assert.AreEqual(countByWord.Value.Item1, countByWord.Key == "The" || countByWord.Key == "dog" ? 23 : 22);
Assert.IsTrue(countByWord.Key == "The" || countByWord.Key == "dog" ?
countByWord.Value.Item2.IsDefined == true && countByWord.Value.Item2.GetValue() == 23 : (countByWord.Key == "brown" ?
countByWord.Value.Item2.IsDefined == true == false : countByWord.Value.Item2.IsDefined == true && countByWord.Value.Item2.GetValue() == 22));
Tuple<string, Tuple<int, Option<int>>> countByWord = (Tuple<string, Tuple<int, Option<int>>>)record;
Assert.AreEqual(countByWord.Item2.Item1, countByWord.Item1 == "The" || countByWord.Item1 == "dog" ? 23 : 22);
Assert.IsTrue(countByWord.Item1 == "The" || countByWord.Item1 == "dog" ?
countByWord.Item2.Item2.IsDefined == true && countByWord.Item2.Item2.GetValue() == 23 : (countByWord.Item1 == "brown" ?
countByWord.Item2.Item2.IsDefined == true == false : countByWord.Item2.Item2.IsDefined == true && countByWord.Item2.Item2.GetValue() == 22));
}
});
@ -213,12 +213,12 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, Tuple<Option<int>, int>> countByWord = (KeyValuePair<string, Tuple<Option<int>, int>>)record;
Assert.IsTrue(countByWord.Key == "The" || countByWord.Key == "dog" ?
countByWord.Value.Item1.IsDefined == true && countByWord.Value.Item1.GetValue() == 23 :
(countByWord.Key == "quick" || countByWord.Key == "lazy" ? countByWord.Value.Item1.IsDefined == false :
countByWord.Value.Item1.IsDefined == true && countByWord.Value.Item1.GetValue() == 22));
Assert.AreEqual(countByWord.Value.Item2, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
Tuple<string, Tuple<Option<int>, int>> countByWord = (Tuple<string, Tuple<Option<int>, int>>)record;
Assert.IsTrue(countByWord.Item1 == "The" || countByWord.Item1 == "dog" ?
countByWord.Item2.Item1.IsDefined == true && countByWord.Item2.Item1.GetValue() == 23 :
(countByWord.Item1 == "quick" || countByWord.Item1 == "lazy" ? countByWord.Item2.Item1.IsDefined == false :
countByWord.Item2.Item1.IsDefined == true && countByWord.Item2.Item1.GetValue() == 22));
Assert.AreEqual(countByWord.Item2.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22);
}
});
@ -230,15 +230,15 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, Tuple<Option<int>, Option<int>>> countByWord = (KeyValuePair<string, Tuple<Option<int>, Option<int>>>)record;
Assert.IsTrue(countByWord.Key == "The" || countByWord.Key == "dog" ?
countByWord.Value.Item1.IsDefined == true && countByWord.Value.Item1.GetValue() == 23 :
(countByWord.Key == "quick" || countByWord.Key == "lazy" ? countByWord.Value.Item1.IsDefined == false :
countByWord.Value.Item1.IsDefined == true && countByWord.Value.Item1.GetValue() == 22));
Tuple<string, Tuple<Option<int>, Option<int>>> countByWord = (Tuple<string, Tuple<Option<int>, Option<int>>>)record;
Assert.IsTrue(countByWord.Item1 == "The" || countByWord.Item1 == "dog" ?
countByWord.Item2.Item1.IsDefined == true && countByWord.Item2.Item1.GetValue() == 23 :
(countByWord.Item1 == "quick" || countByWord.Item1 == "lazy" ? countByWord.Item2.Item1.IsDefined == false :
countByWord.Item2.Item1.IsDefined == true && countByWord.Item2.Item1.GetValue() == 22));
Assert.IsTrue(countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ?
countByWord.Value.Item2.IsDefined == true && countByWord.Value.Item2.GetValue() == 23 :
(countByWord.Key == "brown" ? countByWord.Value.Item2.IsDefined == false : countByWord.Value.Item2.IsDefined == true && countByWord.Value.Item2.GetValue() == 22));
Assert.IsTrue(countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ?
countByWord.Item2.Item2.IsDefined == true && countByWord.Item2.Item2.GetValue() == 23 :
(countByWord.Item1 == "brown" ? countByWord.Item2.Item2.IsDefined == false : countByWord.Item2.Item2.IsDefined == true && countByWord.Item2.Item2.GetValue() == 22));
}
});
}
@ -254,7 +254,7 @@ namespace AdapterTest
var words = lines.FlatMap(l => l.Split(' '));
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
var pairs = words.Map(w => new Tuple<string, int>(w, 1));
var doubleCounts = pairs.GroupByKey().FlatMapValues(vs => vs).MapValues(v => 2 * v).ReduceByKey((x, y) => x + y);
doubleCounts.ForeachRDD((time, rdd) =>
@ -264,15 +264,15 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 2 * 23 : 2 * 22);
Tuple<string, int> countByWord = (Tuple<string, int>)record;
Assert.AreEqual(countByWord.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 2 * 23 : 2 * 22);
}
});
// disable pipelining to UpdateStateByKey, which relies on checkpointing that the mock proxy doesn't support
pairs.Cache();
var initialStateRdd = ssc.SparkContext.Parallelize(new[] { "AAA" }).Map( w => new KeyValuePair<string, int>("AAA", 22));
var initialStateRdd = ssc.SparkContext.Parallelize(new[] { "AAA" }).Map( w => new Tuple<string, int>("AAA", 22));
var state = pairs.UpdateStateByKey<string, int, int>((v, s) => s + (v as List<int>).Count, initialStateRdd);
state.ForeachRDD((time, rdd) =>
{
@ -281,8 +281,8 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
Assert.AreEqual(countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22, countByWord.Value);
Tuple<string, int> countByWord = (Tuple<string, int>)record;
Assert.AreEqual(countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22, countByWord.Item2);
}
});
@ -295,8 +295,8 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
Assert.AreEqual(countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22, countByWord.Value);
Tuple<string, int> countByWord = (Tuple<string, int>)record;
Assert.AreEqual(countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22, countByWord.Item2);
}
});
}
@ -330,7 +330,7 @@ namespace AdapterTest
var ssc = new StreamingContext(new SparkContext(sparkContextProxy.Object, sparkConf), 10000L);
var dstreamProxy = new Mock<IDStreamProxy>();
var pairDStream = new DStream<KeyValuePair<string, int>>(dstreamProxy.Object, ssc);
var pairDStream = new DStream<Tuple<string, int>>(dstreamProxy.Object, ssc);
var stateSpec = new StateSpec<string, int, int, int>((k, v, s) => v);
var stateDStream = pairDStream.MapWithState(stateSpec);
@ -373,7 +373,7 @@ namespace AdapterTest
Assert.IsNotNull(resultRdd);
// test when initialStateRdd is not null
var initialStateRdd = new RDD<KeyValuePair<string, int>>(new Mock<IRDDProxy>().Object, null);
var initialStateRdd = new RDD<Tuple<string, int>>(new Mock<IRDDProxy>().Object, null);
var stateSpec2 = new StateSpec<string, int, int, int>((k, v, s) => v).InitialState(initialStateRdd).NumPartitions(2);
var helper2 = new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, stateSpec2);
@ -404,13 +404,13 @@ namespace AdapterTest
var input = new dynamic[4];
var preStateRddRecord = new MapWithStateRDDRecord<string, int, int>(ticks - TimeSpan.FromSeconds(2).Ticks, new [] { new KeyValuePair<string, int>("1", 1), new KeyValuePair<string, int>("2", 2)});
var preStateRddRecord = new MapWithStateRDDRecord<string, int, int>(ticks - TimeSpan.FromSeconds(2).Ticks, new [] { new Tuple<string, int>("1", 1), new Tuple<string, int>("2", 2)});
preStateRddRecord.stateMap.Add("expired", new KeyedState<int>(0, ticks - TimeSpan.FromSeconds(60).Ticks));
input[0] = preStateRddRecord;
input[1] = new KeyValuePair<string, int>("1", -1);
input[2] = new KeyValuePair<string, int>("2", 2);
input[3] = new KeyValuePair<string, int>("3", 3);
input[1] = new Tuple<string, int>("1", -1);
input[2] = new Tuple<string, int>("2", 2);
input[3] = new Tuple<string, int>("3", 3);
var result = helper.Execute(1, input).GetEnumerator();
Assert.IsNotNull(result);

View file

@ -19,7 +19,7 @@ namespace AdapterTest
var sparkContext = new SparkContext(null);
var lines = sparkContext.TextFile(Path.GetTempFileName());
var words = lines.FlatMap(l => l.Split(' '));
doubles = words.Map(w => new KeyValuePair<string, int>(w, 1)).ReduceByKey((x, y) => x + y).Map(kv => (double)kv.Value);
doubles = words.Map(w => new Tuple<string, int>(w, 1)).ReduceByKey((x, y) => x + y).Map(kv => (double)kv.Item2);
}
[Test]

View file

@ -22,7 +22,7 @@ namespace AdapterTest
var streamingContextProxy = new Mock<IStreamingContextProxy>();
var mockDstreamProxy = new Mock<IDStreamProxy>().Object;
streamingContextProxy.Setup(
m => m.EventHubsUnionStream(It.IsAny<Dictionary<string, string>>(), It.IsAny<StorageLevelType>()))
m => m.EventHubsUnionStream(It.IsAny<IEnumerable<Tuple<string, string>>>(), It.IsAny<StorageLevelType>()))
.Returns(mockDstreamProxy);
var mockSparkClrProxy = new Mock<ISparkCLRProxy>();
@ -32,7 +32,7 @@ namespace AdapterTest
var sparkContext = new SparkContext(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy, new SparkConf(new Mock<ISparkConfProxy>().Object));
var streamingContext = new StreamingContext(sparkContext, 123L);
var dstream = EventHubsUtils.CreateUnionStream(streamingContext, new Dictionary<string, string>());
var dstream = EventHubsUtils.CreateUnionStream(streamingContext, new List<Tuple<string, string>>());
Assert.AreEqual(mockDstreamProxy, dstream.DStreamProxy);
}
}

View file

@ -137,7 +137,7 @@ namespace AdapterTest.Mocks
return this;
}
public IRDDProxy SampleByKey(bool withReplacement, Dictionary<string, double> fractions, long seed)
public IRDDProxy SampleByKey(bool withReplacement, IEnumerable<Tuple<string, double>> fractions, long seed)
{
return this;
}
@ -152,13 +152,13 @@ namespace AdapterTest.Mocks
return null;
}
public void SaveAsNewAPIHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf)
public void SaveAsNewAPIHadoopDataset(IEnumerable<Tuple<string, string>> conf)
{ }
public void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf)
public void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf)
{ }
public void SaveAsHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf)
public void SaveAsHadoopDataset(IEnumerable<Tuple<string, string>> conf)
{ }
public void SaveAsSequenceFile(string path, string compressionCodecClass)
@ -168,7 +168,7 @@ namespace AdapterTest.Mocks
{ }
public void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf, string compressionCodecClass)
public void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf, string compressionCodecClass)
{ }

View file

@ -135,22 +135,22 @@ namespace AdapterTest.Mocks
return new MockRddProxy(null);
}
public IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
public IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
{
return new MockRddProxy(null);
}
public IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
public IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
{
return new MockRddProxy(null);
}
public IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
public IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
{
return new MockRddProxy(null);
}
public IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
public IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
{
return new MockRddProxy(null);
}

View file

@ -13,7 +13,7 @@ namespace AdapterTest.Mocks
class MockSparkSessionProxy : ISparkSessionProxy
{
public ISqlContextProxy SqlContextProxy { get { return new MockSqlContextProxy(new MockSparkContextProxy(new MockSparkConfProxy()));} }
public IUdfRegistration Udf { get; }
public IUdfRegistrationProxy Udf { get; }
public ICatalogProxy GetCatalog()
{
throw new NotImplementedException();

View file

@ -4,11 +4,8 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using System.Threading.Tasks;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Proxy;
@ -39,17 +36,17 @@ namespace AdapterTest.Mocks
return new MockDStreamProxy();
}
public IDStreamProxy KafkaStream(Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, Microsoft.Spark.CSharp.Core.StorageLevelType storageLevelType)
public IDStreamProxy KafkaStream(IEnumerable<Tuple<string, int>> topics, IEnumerable<Tuple<string, string>> kafkaParams, Microsoft.Spark.CSharp.Core.StorageLevelType storageLevelType)
{
return new MockDStreamProxy();
}
public IDStreamProxy DirectKafkaStream(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets)
public IDStreamProxy DirectKafkaStream(List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets)
{
return new MockDStreamProxy();
}
public IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets,
public IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets,
int numPartitions, byte[] readFunc, string serializationMode)
{
return new MockDStreamProxy();
@ -92,13 +89,23 @@ namespace AdapterTest.Mocks
public IDStreamProxy CreateCSharpReducedWindowedDStream(IDStreamProxy jdstream, byte[] func, byte[] invFunc, int windowSeconds, int slideSeconds, string serializationMode)
{
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> f = (Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>>)formatter.Deserialize(new MemoryStream(func));
RDD<dynamic> rdd = f(DateTime.UtcNow.Ticks,
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> f = (Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>>) formatter.Deserialize(new MemoryStream(func));
var ticks = DateTime.UtcNow.Ticks;
RDD<dynamic> rdd = f(ticks,
new RDD<dynamic>((jdstream as MockDStreamProxy).rddProxy ?? new MockRddProxy(null), new SparkContext("", "")),
new RDD<dynamic>((jdstream as MockDStreamProxy).rddProxy ?? new MockRddProxy(null), new SparkContext("", "")));
return new MockDStreamProxy(rdd.RddProxy);
}
if (invFunc == null) return new MockDStreamProxy(rdd.RddProxy);
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> invf = (Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>>) formatter.Deserialize(new MemoryStream(invFunc));
RDD<dynamic> invRdd = invf(ticks,
new RDD<dynamic>((jdstream as MockDStreamProxy).rddProxy ?? new MockRddProxy(null), new SparkContext("", "")),
new RDD<dynamic>((jdstream as MockDStreamProxy).rddProxy ?? new MockRddProxy(null), new SparkContext("", "")));
var difference = rdd.Subtract(invRdd);
return new MockDStreamProxy(difference.RddProxy);
}
public IDStreamProxy CreateCSharpStateDStream(IDStreamProxy jdstream, byte[] func, string className, string serializationMode, string serializationMode2)
{
@ -119,7 +126,7 @@ namespace AdapterTest.Mocks
return new MockDStreamProxy();
}
public IDStreamProxy EventHubsUnionStream(Dictionary<string, string> eventHubsParams, StorageLevelType storageLevelType)
public IDStreamProxy EventHubsUnionStream(IEnumerable<Tuple<string, string>> eventHubsParams, StorageLevelType storageLevelType)
{
throw new NotImplementedException();
}

View file

@ -10,7 +10,7 @@ namespace AdapterTest
[TestFixture]
public class PairRDDTest
{
private static RDD<KeyValuePair<string, int>> pairs;
private static RDD<Tuple<string, int>> pairs;
[OneTimeSetUp]
public static void Initialize()
@ -18,7 +18,7 @@ namespace AdapterTest
var sparkContext = new SparkContext(null);
var lines = sparkContext.TextFile(Path.GetTempFileName());
var words = lines.FlatMap(l => l.Split(' '));
pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
pairs = words.Map(w => new Tuple<string, int>(w, 1));
}
[Test]
@ -27,7 +27,7 @@ namespace AdapterTest
foreach (var record in pairs.CountByKey())
{
// the 1st parameter of the AreEqual() method is the expected value, the 2nd one is the actual value.
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2);
}
}
@ -36,53 +36,53 @@ namespace AdapterTest
{
foreach (var record in pairs.GroupWith(pairs).Collect())
{
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item1.Count);
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item2.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item1.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item2.Count);
}
foreach (var record in pairs.GroupWith(pairs, pairs).Collect())
{
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item1.Count);
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item2.Count);
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item3.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item1.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item2.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item3.Count);
}
foreach (var record in pairs.GroupWith(pairs, pairs, pairs).Collect())
{
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item1.Count);
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item2.Count);
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item3.Count);
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item4.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item1.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item2.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item3.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item4.Count);
}
}
/// <summary>
/// Test RDD.GroupWith() method with different KeyValuePair<K,V> types.
/// Test RDD.GroupWith() method with different Tuple<K,V> types.
/// </summary>
[Test]
public void TestPairRddGroupWith2()
{
var pairs1 = pairs.Map(p => new KeyValuePair<string, double>(p.Key, Convert.ToDouble(p.Value)));
var pairs2 = pairs.Map(p => new KeyValuePair<string, string>(p.Key, p.Value.ToString()));
var pairs3 = pairs.Map(p => new KeyValuePair<string, long>(p.Key, Convert.ToInt64(p.Value)));
var pairs1 = pairs.Map(p => new Tuple<string, double>(p.Item1, Convert.ToDouble(p.Item2)));
var pairs2 = pairs.Map(p => new Tuple<string, string>(p.Item1, p.Item2.ToString()));
var pairs3 = pairs.Map(p => new Tuple<string, long>(p.Item1, Convert.ToInt64(p.Item2)));
foreach (var record in pairs.GroupWith(pairs1).Collect())
{
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item1.Count);
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item2.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item1.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item2.Count);
}
foreach (var record in pairs.GroupWith(pairs1, pairs2).Collect())
{
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item1.Count);
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item2.Count);
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item3.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item1.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item2.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item3.Count);
}
foreach (var record in pairs.GroupWith(pairs1, pairs2, pairs3).Collect())
{
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item1.Count);
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item2.Count);
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item3.Count);
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item4.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item1.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item2.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item3.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item4.Count);
}
}
@ -90,10 +90,10 @@ namespace AdapterTest
public void TestPairRddSubtractByKey()
{
var reduce = pairs.ReduceByKey((x, y) => x + y);
var records = reduce.SubtractByKey(reduce.Filter(kvp => kvp.Key != "The")).Collect();
var records = reduce.SubtractByKey(reduce.Filter(kvp => kvp.Item1 != "The")).Collect();
Assert.AreEqual(1, records.Length);
Assert.AreEqual("The", records[0].Key);
Assert.AreEqual(23, records[0].Value);
Assert.AreEqual("The", records[0].Item1);
Assert.AreEqual(23, records[0].Item2);
}
[Test]
@ -105,12 +105,45 @@ namespace AdapterTest
}
}
[Serializable]
private class IntWrapper
{
public IntWrapper(int value)
{
Value = value;
}
public int Value { get; }
}
[Test]
public void TestPairRddReduceByKeyWithObjects()
{
// The ReduceByKey method below fails with NPE if ReduceByKey
// calls CombineByKey with () => default(V) as seed generator
var sums = pairs
.MapValues(value => new IntWrapper(value))
.ReduceByKey((x, y) => new IntWrapper(x.Value + y.Value));
var result = sums
.CollectAsMap()
.Select(pair => new KeyValuePair<string, int>(pair.Key, pair.Value.Value))
.ToList();
var expectedResult = pairs
.ReduceByKey((x, y) => x + y)
.CollectAsMap()
.ToList();
Assert.That(result, Is.EquivalentTo(expectedResult));
}
[Test]
public void TestPairRddFoldByKey()
{
foreach (var record in pairs.FoldByKey(() => 0, (x, y) => x + y).Collect())
{
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2);
}
}
@ -119,7 +152,7 @@ namespace AdapterTest
{
foreach (var record in pairs.AggregateByKey(() => 0, (x, y) => x + y, (x, y) => x + y).Collect())
{
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2);
}
}
@ -128,7 +161,7 @@ namespace AdapterTest
{
foreach (var record in pairs.GroupByKey().Collect())
{
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Count);
}
}
@ -165,7 +198,7 @@ namespace AdapterTest
[Test]
public void TestPairRddSortByKey()
{
var expectedSortedRdd = pairs.Collect().OrderBy(kv => kv.Key, StringComparer.OrdinalIgnoreCase).ToArray();
var expectedSortedRdd = pairs.Collect().OrderBy(kv => kv.Item1, StringComparer.OrdinalIgnoreCase).ToArray();
var rddSortByKey = pairs.SortByKey(true, null, key => key.ToLowerInvariant()).Collect();
CollectionAssert.AreEqual(expectedSortedRdd, rddSortByKey);
}
@ -173,11 +206,19 @@ namespace AdapterTest
[Test]
public void TestPairRddSortByKey2()
{
var expectedSortedRdd = pairs.Collect().OrderBy(kv => kv.Key, StringComparer.OrdinalIgnoreCase).ToArray();
var expectedSortedRdd = pairs.Collect().OrderBy(kv => kv.Item1, StringComparer.OrdinalIgnoreCase).ToArray();
var rddSortByKey = pairs.SortByKey(true, 1, key => key.ToLowerInvariant()).Collect();
CollectionAssert.AreEqual(expectedSortedRdd, rddSortByKey);
}
[Test]
public void TestPairRddSortByKey3()
{
var expectedSortedRdd = pairs.Collect().OrderByDescending(kv => kv.Item1, StringComparer.OrdinalIgnoreCase).ToArray();
var rddSortByKey = pairs.SortByKey(false, 1, key => key.ToLowerInvariant()).Collect();
CollectionAssert.AreEqual(expectedSortedRdd, rddSortByKey);
}
[Test]
public void TestPairRddProxy()
{

@ -20,6 +20,7 @@ namespace AdapterTest
public class RDDTest
{
private static RDD<string> words;
private static RDD<string> empty;
[OneTimeSetUp]
public static void Initialize()
@ -27,6 +28,7 @@ namespace AdapterTest
var sparkContext = new SparkContext(null);
var lines = sparkContext.TextFile(Path.GetTempFileName());
words = lines.FlatMap(l => l.Split(' '));
empty = sparkContext.EmptyRDD<string>();
}
[Test]
@ -42,7 +44,7 @@ namespace AdapterTest
{
foreach (var record in words.CountByValue())
{
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2);
}
}
@ -82,6 +84,7 @@ namespace AdapterTest
public void TestRddTreeAggregate()
{
Assert.AreEqual(201, words.Map(w => 1).TreeAggregate(0, (x, y) => x + y, (x, y) => x + y));
Assert.Throws<ArgumentException>(() => empty.TreeAggregate(0, (x, y) => 1, (x, y) => x + y, 0));
}
[Test]
@ -119,14 +122,14 @@ namespace AdapterTest
{
words.GroupBy(w => w).Foreach(record =>
{
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Count);
});
words.GroupBy(w => w).ForeachPartition(iter =>
{
foreach (var record in iter)
{
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Count);
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Count);
}
});
}
@ -135,6 +138,7 @@ namespace AdapterTest
public void TestRddIsEmpty()
{
Assert.IsFalse(words.IsEmpty());
Assert.IsTrue(empty.IsEmpty());
Assert.IsTrue(words.Filter(w => w == null).IsEmpty());
}
@ -144,7 +148,7 @@ namespace AdapterTest
int index = 0;
foreach(var record in words.ZipWithIndex().Collect())
{
Assert.AreEqual(index++, record.Value);
Assert.AreEqual(index++, record.Item2);
}
}
@ -155,7 +159,7 @@ namespace AdapterTest
int num = words.GetNumPartitions();
foreach (var record in words.ZipWithUniqueId().Collect())
{
Assert.AreEqual(num * index++, record.Value);
Assert.AreEqual(num * index++, record.Item2);
}
}
@ -166,6 +170,7 @@ namespace AdapterTest
Assert.AreEqual(20, words.TakeSample(true, 20, 1).Length);
Assert.Throws<ArgumentException>(() => words.TakeSample(true, -1, 1));
Assert.AreEqual(0, words.TakeSample(true, 0, 1).Length);
Assert.AreEqual(20, words.TakeSample(false, 20, 1).Length);
}
[Test]

@ -152,6 +152,23 @@ namespace AdapterTest
Assert.IsNotNull(hadoopConf);
}
[Test]
public void TestRunJob()
{
// Arrange
Mock<ISparkContextProxy> sparkContextProxy = new Mock<ISparkContextProxy>();
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
RDD<int> rdd = sc.Parallelize(new int[] {0, 1, 2, 3, 4, 5}, 2);
sparkContextProxy.Setup(m => m.RunJob(It.IsAny<IRDDProxy>(), It.IsAny<IEnumerable<int>>()));
// Act
int[] partitions = new int[] { 0, 1 };
rdd.SparkContext.RunJob(rdd, partitions);
// Assert
sparkContextProxy.Verify(m => m.RunJob(rdd.RddProxy, partitions), Times.Once);
}
[Test]
public void TestCancelAllJobs()
{
@ -355,7 +372,7 @@ namespace AdapterTest
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
// Act
RDD<KeyValuePair<byte[], byte[]>> rdd = sc.WholeTextFiles(filePath, null);
RDD<Tuple<byte[], byte[]>> rdd = sc.WholeTextFiles(filePath, null);
// Assert
Assert.IsNotNull(rdd);
@ -377,7 +394,7 @@ namespace AdapterTest
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
// Act
RDD<KeyValuePair<byte[], byte[]>> rdd = sc.BinaryFiles(filePath, null);
RDD<Tuple<byte[], byte[]>> rdd = sc.BinaryFiles(filePath, null);
// Assert
Assert.IsNotNull(rdd);
@ -428,7 +445,7 @@ namespace AdapterTest
Mock<IRDDProxy> rddProxy = new Mock<IRDDProxy>();
Mock<ISparkContextProxy> sparkContextProxy = new Mock<ISparkContextProxy>();
sparkContextProxy.Setup(m => m.NewAPIHadoopFile(filePath, It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<KeyValuePair<string, string>>>(), It.IsAny<int>()))
sparkContextProxy.Setup(m => m.NewAPIHadoopFile(filePath, It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<Tuple<string, string>>>(), It.IsAny<int>()))
.Returns(rddProxy.Object);
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
@ -456,7 +473,7 @@ namespace AdapterTest
Mock<IRDDProxy> rddProxy = new Mock<IRDDProxy>();
Mock<ISparkContextProxy> sparkContextProxy = new Mock<ISparkContextProxy>();
sparkContextProxy.Setup(m => m.HadoopFile(filePath, It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<KeyValuePair<string, string>>>(), It.IsAny<int>()))
sparkContextProxy.Setup(m => m.HadoopFile(filePath, It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<Tuple<string, string>>>(), It.IsAny<int>()))
.Returns(rddProxy.Object);
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
@ -482,12 +499,12 @@ namespace AdapterTest
Mock<IRDDProxy> rddProxy = new Mock<IRDDProxy>();
Mock<ISparkContextProxy> sparkContextProxy = new Mock<ISparkContextProxy>();
sparkContextProxy.Setup(m => m.NewAPIHadoopRDD(It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<KeyValuePair<string, string>>>(), It.IsAny<int>()))
sparkContextProxy.Setup(m => m.NewAPIHadoopRDD(It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<Tuple<string, string>>>(), It.IsAny<int>()))
.Returns(rddProxy.Object);
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
const string inputFormatClass = "org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
var conf = new KeyValuePair<string, string>[] { };
var conf = new Tuple<string, string>[] { };
// Act
RDD<byte[]> rdd = sc.NewAPIHadoopRDD(inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, conf);
@ -509,12 +526,12 @@ namespace AdapterTest
Mock<IRDDProxy> rddProxy = new Mock<IRDDProxy>();
Mock<ISparkContextProxy> sparkContextProxy = new Mock<ISparkContextProxy>();
sparkContextProxy.Setup(m => m.HadoopRDD(It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<KeyValuePair<string, string>>>(), It.IsAny<int>()))
sparkContextProxy.Setup(m => m.HadoopRDD(It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<Tuple<string, string>>>(), It.IsAny<int>()))
.Returns(rddProxy.Object);
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
const string inputFormatClass = "org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
var conf = new KeyValuePair<string, string>[] { };
var conf = new Tuple<string, string>[] { };
// Act
RDD<byte[]> rdd = sc.HadoopRDD(inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, conf);

@ -32,22 +32,22 @@ namespace AdapterTest
var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
Assert.IsNotNull(socketStream.DStreamProxy);
var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId", new Dictionary<string, int> { { "testTopic1", 1 } }, new Dictionary<string, string>());
var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId", new [] { Tuple.Create("testTopic1", 1) }, null);
Assert.IsNotNull(kafkaStream.DStreamProxy);
var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" }, new Dictionary<string, string>(), new Dictionary<string, long>());
var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" }, new List<Tuple<string, string>>(), new List<Tuple<string, long>>());
Assert.IsNotNull(directKafkaStream.DStreamProxy);
ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numPartitions.testTopic3", "10");
var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic3" }, new Dictionary<string, string>(), new Dictionary<string, long>());
var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic3" }, new List<Tuple<string, string>>(), new List<Tuple<string, long>>());
Assert.IsNotNull(directKafkaStreamWithRepartition.DStreamProxy);
var directKafkaStreamWithRepartitionAndReadFunc = KafkaUtils.CreateDirectStream(
ssc,
new List<string> { "testTopic3" },
new Dictionary<string, string>(), new Dictionary<string, long>(),
(int pid, IEnumerable<KeyValuePair<byte[], byte[]>> input) => { return input; });
new List<Tuple<string, string>>(), new List<Tuple<string, long>>(),
(int pid, IEnumerable<Tuple<byte[], byte[]>> input) => { return input; });
Assert.IsNotNull(directKafkaStreamWithRepartitionAndReadFunc);
ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numReceivers", "10");
@ -55,8 +55,8 @@ namespace AdapterTest
var directKafkaReceiver = KafkaUtils.CreateDirectStream(
ssc,
new List<string> { "testTopic3" },
new Dictionary<string, string>(), new Dictionary<string, long>(),
(int pid, IEnumerable<KeyValuePair<byte[], byte[]>> input) => { return input; });
new List<Tuple<string, string>>(), new List<Tuple<string, long>>(),
(int pid, IEnumerable<Tuple<byte[], byte[]>> input) => { return input; });
Assert.IsNotNull(directKafkaReceiver.DStreamProxy);
var union = ssc.Union(textFile, socketStream);
@ -99,10 +99,10 @@ namespace AdapterTest
byte[] untilOffset = BitConverter.GetBytes(3L);
Array.Reverse(untilOffset);
var offsetRange = KafkaUtils.GetOffsetRange(new List<KeyValuePair<byte[], byte[]>>
var offsetRange = KafkaUtils.GetOffsetRange(new List<Tuple<byte[], byte[]>>
{
new KeyValuePair<byte[], byte[]>(Encoding.UTF8.GetBytes("testTopic,testClusterId"), partition),
new KeyValuePair<byte[], byte[]>(fromOffset, untilOffset)
new Tuple<byte[], byte[]>(Encoding.UTF8.GetBytes("testTopic,testClusterId"), partition),
new Tuple<byte[], byte[]>(fromOffset, untilOffset)
});
Assert.AreEqual(offsetRange.Topic, "testTopic");

@ -175,7 +175,7 @@ namespace AdapterTest
// Act
var lines = _streamingContext.TextFileStream(Path.GetTempPath());
var words = lines.FlatMap(l => l.Split(' '));
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
var pairs = words.Map(w => new Tuple<string, int>(w, 1));
var wordCounts = pairs.ReduceByKey((x, y) => x + y);
// Assert
@ -186,8 +186,8 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
Tuple<string, int> countByWord = (Tuple<string, int>)record;
Assert.AreEqual(countByWord.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22);
}
});
// Use Verify to verify if a method to mock was invoked

@ -0,0 +1,57 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Sql;
using Moq;
using NUnit.Framework;
namespace AdapterTest
{
[TestFixture]
public class UdfRegistrationTest
{
[Test]
public void TestRegisterFunction()
{
Mock<IUdfRegistrationProxy> mockUdfRegistrationProxy = new Mock<IUdfRegistrationProxy>();
mockUdfRegistrationProxy.Setup(m => m.RegisterFunction(It.IsAny<string>(), It.IsAny<byte[]>(), It.IsAny<string>()));
var udfRegistration = new UdfRegistration(mockUdfRegistrationProxy.Object);
udfRegistration.RegisterFunction("Func0", () => "Func0");
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func0", It.IsAny<byte[]>(), "string"));
udfRegistration.RegisterFunction<string, string>("Func1", s => "Func1");
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func1", It.IsAny<byte[]>(), "string"));
udfRegistration.RegisterFunction<string, string, string>("Func2", (s1, s2) => "Func2");
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func2", It.IsAny<byte[]>(), "string"));
udfRegistration.RegisterFunction<string, string, string, string>("Func3", (s1, s2, s3) => "Func3");
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func3", It.IsAny<byte[]>(), "string"));
udfRegistration.RegisterFunction<string, string, string, string, string>("Func4", (s1, s2, s3, s4) => "Func4");
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func4", It.IsAny<byte[]>(), "string"));
udfRegistration.RegisterFunction<string, string, string, string, string, string>("Func5", (s1, s2, s3, s4, s5) => "Func5");
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func5", It.IsAny<byte[]>(), "string"));
udfRegistration.RegisterFunction<string, string, string, string, string, string, string>("Func6", (s1, s2, s3, s4, s5, s6) => "Func6");
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func6", It.IsAny<byte[]>(), "string"));
udfRegistration.RegisterFunction<string, string, string, string, string, string, string, string>("Func7", (s1, s2, s3, s4, s5, s6, s7) => "Func7");
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func7", It.IsAny<byte[]>(), "string"));
udfRegistration.RegisterFunction<string, string, string, string, string, string, string, string, string>("Func8", (s1, s2, s3, s4, s5, s6, s7, s8) => "Func8");
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func8", It.IsAny<byte[]>(), "string"));
udfRegistration.RegisterFunction<string, string, string, string, string, string, string, string, string, string>("Func9", (s1, s2, s3, s4, s5, s6, s7, s8, s9) => "Func9");
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func9", It.IsAny<byte[]>(), "string"));
udfRegistration.RegisterFunction<string, string, string, string, string, string, string, string, string, string, string>("Func10", (s1, s2, s3, s4, s5, s6, s7, s8, s9, s10) => "Func10");
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func10", It.IsAny<byte[]>(), "string"));
}
}
}

@ -66,11 +66,11 @@ namespace Microsoft.Spark.CSharp.PerfBenchmark
var flaggedRows = parsedRows.Filter(s => s.Item1); //select good rows
var selectedDeletions = flaggedRows.Filter(s => s.Item3.Equals(s.Item5)); //select deletions made by same creators
var userDeletions = selectedDeletions.Map(s => new KeyValuePair<string, int>(s.Item3, 1));
var userDeletions = selectedDeletions.Map(s => new Tuple<string, int>(s.Item3, 1));
var userDeletionCount = userDeletions.ReduceByKey((x, y) => x + y);
var userWithMaxDeletions = userDeletionCount.Fold(new KeyValuePair<string, int>("zerovalue", 0), (kvp1, kvp2) =>
var userWithMaxDeletions = userDeletionCount.Fold(new Tuple<string, int>("zerovalue", 0), (kvp1, kvp2) =>
{
if (kvp1.Value > kvp2.Value)
if (kvp1.Item2 > kvp2.Item2)
return kvp1;
else
return kvp2;
@ -79,7 +79,7 @@ namespace Microsoft.Spark.CSharp.PerfBenchmark
stopwatch.Stop();
PerfBenchmark.ExecutionTimeList.Add(stopwatch.Elapsed);
Console.WriteLine("User with max deletions is {0}, count of deletions={1}. Elapsed time={2}", userWithMaxDeletions.Key, userWithMaxDeletions.Value, stopwatch.Elapsed);
Console.WriteLine("User with max deletions is {0}, count of deletions={1}. Elapsed time={2}", userWithMaxDeletions.Item1, userWithMaxDeletions.Item2, stopwatch.Elapsed);
}
[PerfSuite]

@ -75,14 +75,14 @@ namespace Microsoft.Spark.CSharp
var lines = context.TextFileStream(Path.Combine(directory, "test"));
lines = context.Union(lines, lines);
var words = lines.FlatMap(l => l.Split(' '));
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
var pairs = words.Map(w => new Tuple<string, int>(w, 1));
// since operations like ReduceByKey, Join and UpdateStateByKey are
// separate dstream transformations defined in CSharpDStream.scala
// an extra CSharpRDD is introduced in between these operations
var wordCounts = pairs.ReduceByKey((x, y) => x + y);
var join = wordCounts.Window(2, 2).Join(wordCounts, 2);
var initialStateRdd = sc.Parallelize( new[] {new KeyValuePair<string, int>("AAA", 88), new KeyValuePair<string, int>("BBB", 88)});
var initialStateRdd = sc.Parallelize( new[] {new Tuple<string, int>("AAA", 88), new Tuple<string, int>("BBB", 88)});
var state = join.UpdateStateByKey(new UpdateStateHelper(b).Execute, initialStateRdd);
state.ForeachRDD((time, rdd) =>
@ -99,8 +99,8 @@ namespace Microsoft.Spark.CSharp
{
Console.WriteLine(record);
var countByWord = (KeyValuePair<string, int>)record;
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "lazy" || countByWord.Key == "dog" ? 92 : 88);
var countByWord = (Tuple<string, int>)record;
Assert.AreEqual(countByWord.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "lazy" || countByWord.Item1 == "dog" ? 92 : 88);
}
Console.WriteLine();
@ -145,13 +145,13 @@ namespace Microsoft.Spark.CSharp
StreamingContext context = new StreamingContext(sc, 2000L);
context.Checkpoint(checkpointPath);
var kafkaParams = new Dictionary<string, string> {
{"metadata.broker.list", brokers},
{"auto.offset.reset", "smallest"}
var kafkaParams = new List<Tuple<string, string>> {
new Tuple<string, string>("metadata.broker.list", brokers),
new Tuple<string, string>("auto.offset.reset", "smallest")
};
conf.Set("spark.mobius.streaming.kafka.numPartitions." + topic, partitions.ToString());
var dstream = KafkaUtils.CreateDirectStream(context, new List<string> { topic }, kafkaParams, new Dictionary<string, long>());
var dstream = KafkaUtils.CreateDirectStream(context, new List<string> { topic }, kafkaParams, Enumerable.Empty<Tuple<string, long>>());
dstream.ForeachRDD((time, rdd) =>
{
@ -256,7 +256,7 @@ namespace Microsoft.Spark.CSharp
// create the RDD
var seedRDD = sc.Parallelize(Enumerable.Range(0, 100), numPartitions);
var numbers = new ConstantInputDStream<int>(seedRDD, ssc);
var pairs = numbers.Map(n => new KeyValuePair<int, int>(n % numPartitions, n));
var pairs = numbers.Map(n => new Tuple<int, int>(n % numPartitions, n));
var reduced = pairs.ReduceByKeyAndWindow(
(int x, int y) => (x + y),
(int x, int y) => (x - y),
@ -283,10 +283,10 @@ namespace Microsoft.Spark.CSharp
foreach (object record in taken)
{
KeyValuePair<int, int> sum = (KeyValuePair<int, int>)record;
Console.WriteLine("Key: {0}, Value: {1}", sum.Key, sum.Value);
Tuple<int, int> sum = (Tuple<int, int>)record;
Console.WriteLine("Key: {0}, Value: {1}", sum.Item1, sum.Item2);
// when batch count reaches window size, sum of even/odd number stay at windowDuration / slideDuration * (2450, 2500) respectively
Assert.AreEqual(sum.Value, (count > windowDuration / slideDuration ? windowDuration : count * slideDuration) / (bacthIntervalMs / 1000) * (sum.Key == 0 ? 2450 : 2500));
Assert.AreEqual(sum.Item2, (count > windowDuration / slideDuration ? windowDuration : count * slideDuration) / (bacthIntervalMs / 1000) * (sum.Item1 == 0 ? 2450 : 2500));
}
});

@ -60,16 +60,16 @@ namespace Microsoft.Spark.CSharp.Samples
var lines = context.TextFileStream(Path.Combine(directory, "test1"));
lines = context.Union(lines, lines);
var words = lines.FlatMap(l => l.Split(' '));
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
var pairs = words.Map(w => new Tuple<string, int>(w, 1));
var wordCounts = pairs.ReduceByKey((x, y) => x + y);
var initialState = sc.Parallelize(new[] { new KeyValuePair<string, int>("NOT_A_WORD", 1024), new KeyValuePair<string, int>("dog", 10000), }, 1);
var stateSpec = new StateSpec<string, int, int, KeyValuePair<string, int>>((word, count, state) =>
var initialState = sc.Parallelize(new[] { new Tuple<string, int>("NOT_A_WORD", 1024), new Tuple<string, int>("dog", 10000), }, 1);
var stateSpec = new StateSpec<string, int, int, Tuple<string, int>>((word, count, state) =>
{
if (state.IsTimingOut())
{
Console.WriteLine("Found timing out word: {0}", word);
return new KeyValuePair<string, int>(word, state.Get());
return new Tuple<string, int>(word, state.Get());
}
var sum = 0;
@ -79,7 +79,7 @@ namespace Microsoft.Spark.CSharp.Samples
}
state.Update(sum + count);
Console.WriteLine("word: {0}, count: {1}", word, sum + count);
return new KeyValuePair<string, int>(word, sum + count);
return new Tuple<string, int>(word, sum + count);
}).NumPartitions(1).InitialState(initialState).Timeout(TimeSpan.FromSeconds(30));
var snapshots = wordCounts.MapWithState(stateSpec).StateSnapshots();
@ -89,9 +89,9 @@ namespace Microsoft.Spark.CSharp.Samples
Console.WriteLine("Snapshots @ Time: {0}", time);
Console.WriteLine("-------------------------------------------");
foreach (KeyValuePair<string, int> record in rdd.Collect())
foreach (Tuple<string, int> record in rdd.Collect())
{
Console.WriteLine("[{0}, {1}]", record.Key, record.Value);
Console.WriteLine("[{0}, {1}]", record.Item1, record.Item2);
}
Console.WriteLine();
});

@ -15,7 +15,7 @@ namespace Microsoft.Spark.CSharp.Samples
[Sample]
internal static void PairRDDCollectAsMapSample()
{
var map = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).CollectAsMap();
var map = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1).CollectAsMap();
foreach (var kv in map)
Console.WriteLine(kv);
@ -30,7 +30,7 @@ namespace Microsoft.Spark.CSharp.Samples
[Sample]
internal static void PairRDDKeysSample()
{
var keys = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).Keys().Collect();
var keys = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1).Keys().Collect();
Console.WriteLine(keys[0]);
Console.WriteLine(keys[1]);
@ -45,7 +45,7 @@ namespace Microsoft.Spark.CSharp.Samples
[Sample]
internal static void PairRDDValuesSample()
{
var values = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).Values().Collect();
var values = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1).Values().Collect();
Console.WriteLine(values[0]);
Console.WriteLine(values[1]);
@ -63,9 +63,9 @@ namespace Microsoft.Spark.CSharp.Samples
var reduced = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.ReduceByKey((x, y) => x + y).Collect();
@ -74,8 +74,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(reduced.Contains(new KeyValuePair<string, int>("a", 2)));
Assert.IsTrue(reduced.Contains(new KeyValuePair<string, int>("b", 1)));
Assert.IsTrue(reduced.Contains(new Tuple<string, int>("a", 2)));
Assert.IsTrue(reduced.Contains(new Tuple<string, int>("b", 1)));
}
}
@ -85,9 +85,9 @@ namespace Microsoft.Spark.CSharp.Samples
var reduced = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.ReduceByKeyLocally((x, y) => x + y);
@ -107,11 +107,12 @@ namespace Microsoft.Spark.CSharp.Samples
var countByKey = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.CountByKey();
.CountByKey()
.ToDictionary(k => k.Item1, v => v.Item2);
foreach (var kv in countByKey)
Console.WriteLine(kv);
@ -129,15 +130,15 @@ namespace Microsoft.Spark.CSharp.Samples
var l = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 4),
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 4),
}, 1);
var r = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 2),
new KeyValuePair<string, int>("a", 3),
new Tuple<string, int>("a", 2),
new Tuple<string, int>("a", 3),
}, 1);
var joined = l.Join(r, 2).Collect();
@ -147,8 +148,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(joined.Contains(new KeyValuePair<string, Tuple<int, int>>("a", new Tuple<int, int>(1, 2))));
Assert.IsTrue(joined.Contains(new KeyValuePair<string, Tuple<int, int>>("a", new Tuple<int, int>(1, 3))));
Assert.IsTrue(joined.Contains(new Tuple<string, Tuple<int, int>>("a", new Tuple<int, int>(1, 2))));
Assert.IsTrue(joined.Contains(new Tuple<string, Tuple<int, int>>("a", new Tuple<int, int>(1, 3))));
}
}
@ -158,14 +159,14 @@ namespace Microsoft.Spark.CSharp.Samples
var l = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 4),
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 4),
}, 2);
var r = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 2),
new Tuple<string, int>("a", 2),
}, 1);
var joined = l.LeftOuterJoin(r).Collect();
@ -175,8 +176,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(joined.Any(kv => kv.Key == "a" && kv.Value.Item1 == 1 && kv.Value.Item2.IsDefined && kv.Value.Item2.GetValue() == 2));
Assert.IsTrue(joined.Any(kv => kv.Key == "b" && kv.Value.Item1 == 4 && !kv.Value.Item2.IsDefined));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "a" && kv.Item2.Item1 == 1 && kv.Item2.Item2.IsDefined && kv.Item2.Item2.GetValue() == 2));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "b" && kv.Item2.Item1 == 4 && !kv.Item2.Item2.IsDefined));
}
}
@ -186,14 +187,14 @@ namespace Microsoft.Spark.CSharp.Samples
var l = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 2),
new Tuple<string, int>("a", 2),
}, 1);
var r = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 4),
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 4),
}, 2);
var joined = l.RightOuterJoin(r).Collect();
@ -203,8 +204,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(joined.Any(kv => kv.Key == "a" && kv.Value.Item1.IsDefined && kv.Value.Item1.GetValue() == 2 && kv.Value.Item2 == 1));
Assert.IsTrue(joined.Any(kv => kv.Key == "b" && !kv.Value.Item1.IsDefined && kv.Value.Item2 == 4));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "a" && kv.Item2.Item1.IsDefined && kv.Item2.Item1.GetValue() == 2 && kv.Item2.Item2 == 1));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "b" && !kv.Item2.Item1.IsDefined && kv.Item2.Item2 == 4));
}
}
@ -214,15 +215,15 @@ namespace Microsoft.Spark.CSharp.Samples
var l = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 4),
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 4),
}, 2);
var r = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 2),
new KeyValuePair<string, int>("c", 8),
new Tuple<string, int>("a", 2),
new Tuple<string, int>("c", 8),
}, 2);
var joined = l.FullOuterJoin(r).Collect();
@ -232,12 +233,12 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(joined.Any(kv => kv.Key == "a" && kv.Value.Item1.IsDefined && kv.Value.Item1.GetValue() == 1 &&
kv.Value.Item2.IsDefined && kv.Value.Item2.GetValue() == 2));
Assert.IsTrue(joined.Any(kv => kv.Key == "b" && kv.Value.Item1.IsDefined && kv.Value.Item1.GetValue() == 4 &&
!kv.Value.Item2.IsDefined));
Assert.IsTrue(joined.Any(kv => kv.Key == "c" && !kv.Value.Item1.IsDefined &&
kv.Value.Item2.IsDefined && kv.Value.Item2.GetValue() == 8));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "a" && kv.Item2.Item1.IsDefined && kv.Item2.Item1.GetValue() == 1 &&
kv.Item2.Item2.IsDefined && kv.Item2.Item2.GetValue() == 2));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "b" && kv.Item2.Item1.IsDefined && kv.Item2.Item1.GetValue() == 4 &&
!kv.Item2.Item2.IsDefined));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "c" && !kv.Item2.Item1.IsDefined &&
kv.Item2.Item2.IsDefined && kv.Item2.Item2.GetValue() == 8));
}
}
@ -252,7 +253,7 @@ namespace Microsoft.Spark.CSharp.Samples
};
var partitioned = SparkCLRSamples.SparkContext.Parallelize(new[] { 1, 2, 3, 4, 5, 6, 1 }, 3)
.Map(x => new KeyValuePair<int, int>(x, x + 100))
.Map(x => new Tuple<int, int>(x, x + 100))
.PartitionBy(3, partitionFunc)
.Glom()
.Collect();
@ -270,9 +271,9 @@ namespace Microsoft.Spark.CSharp.Samples
{
Assert.AreEqual(3, partitioned.Length);
// Assert that the partition distribution is correct with partitionFunc
Assert.IsTrue(partitioned.Count(p => p.All(key => key.Key < 3)) == 1);
Assert.IsTrue(partitioned.Count(p => p.All(key => key.Key >= 3 && key.Key < 6)) == 1);
Assert.IsTrue(partitioned.Count(p => p.All(key => key.Key >= 6)) == 1);
Assert.IsTrue(partitioned.Count(p => p.All(key => key.Item1 < 3)) == 1);
Assert.IsTrue(partitioned.Count(p => p.All(key => key.Item1 >= 3 && key.Item1 < 6)) == 1);
Assert.IsTrue(partitioned.Count(p => p.All(key => key.Item1 >= 6)) == 1);
}
}
@ -282,9 +283,9 @@ namespace Microsoft.Spark.CSharp.Samples
var combineByKey = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.CombineByKey(() => string.Empty, (x, y) => x + y.ToString(CultureInfo.InvariantCulture), (x, y) => x + y).Collect();
@ -293,8 +294,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(combineByKey.Contains(new KeyValuePair<string, string>("a", "11")));
Assert.IsTrue(combineByKey.Contains(new KeyValuePair<string, string>("b", "1")));
Assert.IsTrue(combineByKey.Contains(new Tuple<string, string>("a", "11")));
Assert.IsTrue(combineByKey.Contains(new Tuple<string, string>("b", "1")));
}
}
@ -304,9 +305,9 @@ namespace Microsoft.Spark.CSharp.Samples
var aggregateByKey = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.AggregateByKey(() => 0, (x, y) => x + y, (x, y) => x + y).Collect();
@ -315,8 +316,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(aggregateByKey.Contains(new KeyValuePair<string, int>("a", 2)));
Assert.IsTrue(aggregateByKey.Contains(new KeyValuePair<string, int>("b", 1)));
Assert.IsTrue(aggregateByKey.Contains(new Tuple<string, int>("a", 2)));
Assert.IsTrue(aggregateByKey.Contains(new Tuple<string, int>("b", 1)));
}
}
@ -326,9 +327,9 @@ namespace Microsoft.Spark.CSharp.Samples
var FoldByKey = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.FoldByKey(() => 0, (x, y) => x + y).Collect();
@ -337,8 +338,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(FoldByKey.Contains(new KeyValuePair<string, int>("a", 2)));
Assert.IsTrue(FoldByKey.Contains(new KeyValuePair<string, int>("b", 1)));
Assert.IsTrue(FoldByKey.Contains(new Tuple<string, int>("a", 2)));
Assert.IsTrue(FoldByKey.Contains(new Tuple<string, int>("b", 1)));
}
}
@ -348,19 +349,19 @@ namespace Microsoft.Spark.CSharp.Samples
var groupByKey = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.GroupByKey().Collect();
foreach (var kv in groupByKey)
Console.WriteLine(kv.Key + ", " + "(" + string.Join(",", kv.Value) + ")");
Console.WriteLine(kv.Item1 + ", " + "(" + string.Join(",", kv.Item2) + ")");
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(groupByKey.Any(kv => kv.Key == "a" && kv.Value.Count == 2 && kv.Value[0] == 1 && kv.Value[1] == 1));
Assert.IsTrue(groupByKey.Any(kv => kv.Key == "b" && kv.Value.Count == 1 && kv.Value[0] == 1));
Assert.IsTrue(groupByKey.Any(kv => kv.Item1 == "a" && kv.Item2.Count == 2 && kv.Item2[0] == 1 && kv.Item2[1] == 1));
Assert.IsTrue(groupByKey.Any(kv => kv.Item1 == "b" && kv.Item2.Count == 1 && kv.Item2[0] == 1));
}
}
@ -370,8 +371,8 @@ namespace Microsoft.Spark.CSharp.Samples
var mapValues = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, string[]>("a", new[]{"apple", "banana", "lemon"}),
new KeyValuePair<string, string[]>("b", new[]{"grapes"})
new Tuple<string, string[]>("a", new[]{"apple", "banana", "lemon"}),
new Tuple<string, string[]>("b", new[]{"grapes"})
}, 2)
.MapValues(x => x.Length).Collect();
@ -380,8 +381,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(mapValues.Any(kv => kv.Key == "a" && kv.Value == 3));
Assert.IsTrue(mapValues.Any(kv => kv.Key == "b" && kv.Value == 1));
Assert.IsTrue(mapValues.Any(kv => kv.Item1 == "a" && kv.Item2 == 3));
Assert.IsTrue(mapValues.Any(kv => kv.Item1 == "b" && kv.Item2 == 1));
}
}
@ -391,8 +392,8 @@ namespace Microsoft.Spark.CSharp.Samples
var flatMapValues = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, string[]>("a", new[]{"x", "y", "z"}),
new KeyValuePair<string, string[]>("b", new[]{"p", "r"})
new Tuple<string, string[]>("a", new[]{"x", "y", "z"}),
new Tuple<string, string[]>("b", new[]{"p", "r"})
}, 2)
.FlatMapValues(x => x).Collect();
@ -401,48 +402,48 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(flatMapValues.Any(kv => kv.Key == "a" && kv.Value == "x"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Key == "a" && kv.Value == "y"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Key == "a" && kv.Value == "z"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Key == "b" && kv.Value == "p"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Key == "b" && kv.Value == "r"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Item1 == "a" && kv.Item2 == "x"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Item1 == "a" && kv.Item2 == "y"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Item1 == "a" && kv.Item2 == "z"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Item1 == "b" && kv.Item2 == "p"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Item1 == "b" && kv.Item2 == "r"));
}
}
[Sample]
internal static void PairRDDGroupWithSample()
{
var x = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4)}, 2);
var y = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("a", 2)}, 1);
var x = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4)}, 2);
var y = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int>("a", 2)}, 1);
var groupWith = x.GroupWith(y).Collect();
foreach (var kv in groupWith)
Console.WriteLine(kv.Key + ", " + "(" + string.Join(",", kv.Value) + ")");
Console.WriteLine(kv.Item1 + ", " + "(" + string.Join(",", kv.Item2) + ")");
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(groupWith.Any(kv => kv.Key == "a" && kv.Value.Item1[0] == 1 && kv.Value.Item2[0] == 2));
Assert.IsTrue(groupWith.Any(kv => kv.Key == "b" && kv.Value.Item1[0] == 4 && !kv.Value.Item2.Any()));
Assert.IsTrue(groupWith.Any(kv => kv.Item1 == "a" && kv.Item2.Item1[0] == 1 && kv.Item2.Item2[0] == 2));
Assert.IsTrue(groupWith.Any(kv => kv.Item1 == "b" && kv.Item2.Item1[0] == 4 && !kv.Item2.Item2.Any()));
}
}
[Sample]
internal static void PairRDDGroupWithSample2()
{
var x = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("a", 5), new KeyValuePair<string, int>("b", 6) }, 2);
var y = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 2);
var z = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("a", 2) }, 1);
var x = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int>("a", 5), new Tuple<string, int>("b", 6) }, 2);
var y = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 2);
var z = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int>("a", 2) }, 1);
var groupWith = x.GroupWith(y, z).Collect();
foreach (var kv in groupWith)
Console.WriteLine(kv.Key + ", " + "(" + string.Join(",", kv.Value) + ")");
Console.WriteLine(kv.Item1 + ", " + "(" + string.Join(",", kv.Item2) + ")");
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(groupWith.Any(kv => kv.Key == "a" && kv.Value.Item1[0] == 5 && kv.Value.Item2[0] == 1 && kv.Value.Item3[0] == 2));
Assert.IsTrue(groupWith.Any(kv => kv.Key == "b" && kv.Value.Item1[0] == 6 && kv.Value.Item2[0] == 4 && !kv.Value.Item3.Any()));
Assert.IsTrue(groupWith.Any(kv => kv.Item1 == "a" && kv.Item2.Item1[0] == 5 && kv.Item2.Item2[0] == 1 && kv.Item2.Item3[0] == 2));
Assert.IsTrue(groupWith.Any(kv => kv.Item1 == "b" && kv.Item2.Item1[0] == 6 && kv.Item2.Item2[0] == 4 && !kv.Item2.Item3.Any()));
}
}
@ -452,7 +453,7 @@ namespace Microsoft.Spark.CSharp.Samples
//{
// var fractions = new Dictionary<string, double> { { "a", 0.2 }, { "b", 0.1 } };
// var rdd = SparkCLRSamples.SparkContext.Parallelize(fractions.Keys.ToArray(), 2).Cartesian(SparkCLRSamples.SparkContext.Parallelize(Enumerable.Range(0, 1000), 2));
// var sample = rdd.Map(t => new KeyValuePair<string, int>(t.Item1, t.Item2)).SampleByKey(false, fractions, 2).GroupByKey().Collect();
// var sample = rdd.Map(t => new Tuple<string, int>(t.Item1, t.Item2)).SampleByKey(false, fractions, 2).GroupByKey().Collect();
// Console.WriteLine(sample);
//}
@ -460,8 +461,8 @@ namespace Microsoft.Spark.CSharp.Samples
[Sample]
internal static void PairRDDSubtractByKeySample()
{
var x = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int?>("a", 1), new KeyValuePair<string, int?>("b", 4), new KeyValuePair<string, int?>("b", 5), new KeyValuePair<string, int?>("a", 2) }, 2);
var y = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int?>("a", 3), new KeyValuePair<string, int?>("c", null) }, 2);
var x = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int?>("a", 1), new Tuple<string, int?>("b", 4), new Tuple<string, int?>("b", 5), new Tuple<string, int?>("a", 2) }, 2);
var y = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int?>("a", 3), new Tuple<string, int?>("c", null) }, 2);
var subtractByKey = x.SubtractByKey(y).Collect();
@ -471,15 +472,15 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.AreEqual(2, subtractByKey.Length);
subtractByKey.Contains(new KeyValuePair<string, int?>("b", 4));
subtractByKey.Contains(new KeyValuePair<string, int?>("b", 5));
subtractByKey.Contains(new Tuple<string, int?>("b", 4));
subtractByKey.Contains(new Tuple<string, int?>("b", 5));
}
}
[Sample]
internal static void PairRDDLookupSample()
{
var rdd = SparkCLRSamples.SparkContext.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new KeyValuePair<int, int>(x, y)), 10);
var rdd = SparkCLRSamples.SparkContext.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new Tuple<int, int>(x, y)), 10);
var lookup42 = rdd.Lookup(42);
var lookup1024 = rdd.Lookup(1024);
Console.WriteLine(string.Join(",", lookup42));
@ -495,9 +496,9 @@ namespace Microsoft.Spark.CSharp.Samples
[Sample]
internal static void PairRDDSortByKeySample()
{
var rdd = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("B", 2),
new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("c", 3),
new KeyValuePair<string, int>("E", 5), new KeyValuePair<string, int>("D", 4)}, 3);
var rdd = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int>("B", 2),
new Tuple<string, int>("a", 1), new Tuple<string, int>("c", 3),
new Tuple<string, int>("E", 5), new Tuple<string, int>("D", 4)}, 3);
var sortedRdd = rdd.SortByKey(true, 2);
var sortedInTotal = sortedRdd.Collect();
@ -507,7 +508,7 @@ namespace Microsoft.Spark.CSharp.Samples
{
Assert.AreEqual(2, sortedPartitions.Length);
// by default SortByKey is case sensitive
CollectionAssert.AreEqual(new[] { "B", "D", "E", "a", "c" }, sortedInTotal.Select(kv => kv.Key).ToArray());
CollectionAssert.AreEqual(new[] { "B", "D", "E", "a", "c" }, sortedInTotal.Select(kv => kv.Item1).ToArray());
}
// convert the keys to lower case in order to sort with case insensitive
@ -518,7 +519,7 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.AreEqual(2, sortedPartitions.Length);
CollectionAssert.AreEqual(new[] { "a", "B", "c", "D", "E" }, sortedInTotal.Select(kv => kv.Key).ToArray());
CollectionAssert.AreEqual(new[] { "a", "B", "c", "D", "E" }, sortedInTotal.Select(kv => kv.Item1).ToArray());
}
}
}

@ -3,6 +3,7 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using Microsoft.Spark.CSharp.Core;
@ -129,7 +130,7 @@ namespace Microsoft.Spark.CSharp.Samples
var rdd = SparkCLRSamples.SparkContext.Parallelize(new int[] { 1, 1, 2, 3, 5, 8 }, 1);
var groups = rdd.GroupBy(x => x % 2).Collect();
foreach (var kv in groups)
Console.WriteLine(kv.Key + ", " + string.Join(",", kv.Value));
Console.WriteLine(kv.Item1 + ", " + string.Join(",", kv.Item2));
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
@ -137,9 +138,9 @@ namespace Microsoft.Spark.CSharp.Samples
foreach (var kv in groups)
{
// the group with key=1 is odd numbers
if (kv.Key == 1) CollectionAssert.AreEquivalent(new[] { 1, 1, 3, 5 }, kv.Value);
if (kv.Item1 == 1) CollectionAssert.AreEquivalent(new[] { 1, 1, 3, 5 }, kv.Item2);
// the group with key=0 is even numbers
else if (kv.Key == 0) CollectionAssert.AreEquivalent(new[] { 2, 8 }, kv.Value);
else if (kv.Item1 == 0) CollectionAssert.AreEquivalent(new[] { 2, 8 }, kv.Item2);
}
}
}
@ -221,7 +222,10 @@ namespace Microsoft.Spark.CSharp.Samples
[Sample]
internal static void RDDCountByValueSample()
{
var countByValue = SparkCLRSamples.SparkContext.Parallelize(new int[] { 1, 2, 1, 2, 2 }, 2).CountByValue();
var countByValue = SparkCLRSamples.SparkContext.Parallelize(new int[] { 1, 2, 1, 2, 2 }, 2)
.CountByValue()
.ToDictionary(k => k.Item1, v => v.Item2);
foreach (var item in countByValue)
Console.WriteLine(item);
@ -292,10 +296,10 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(keyBy.Contains(new KeyValuePair<int, int>(1, 1)));
Assert.IsTrue(keyBy.Contains(new KeyValuePair<int, int>(4, 2)));
Assert.IsTrue(keyBy.Contains(new KeyValuePair<int, int>(9, 3)));
Assert.IsTrue(keyBy.Contains(new KeyValuePair<int, int>(16, 4)));
Assert.IsTrue(keyBy.Contains(new Tuple<int, int>(1, 1)));
Assert.IsTrue(keyBy.Contains(new Tuple<int, int>(4, 2)));
Assert.IsTrue(keyBy.Contains(new Tuple<int, int>(9, 3)));
Assert.IsTrue(keyBy.Contains(new Tuple<int, int>(16, 4)));
}
}
@ -344,7 +348,7 @@ namespace Microsoft.Spark.CSharp.Samples
{
for (int i = 0; i < 5; i++)
{
Assert.IsTrue(zip.Contains(new KeyValuePair<int, int>(i, 1000 + i)));
Assert.IsTrue(zip.Contains(new Tuple<int, int>(i, 1000 + i)));
}
}
}
@ -358,10 +362,10 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(zipWithIndex.Contains(new KeyValuePair<string, long>("a", 0)));
Assert.IsTrue(zipWithIndex.Contains(new KeyValuePair<string, long>("b", 1)));
Assert.IsTrue(zipWithIndex.Contains(new KeyValuePair<string, long>("c", 2)));
Assert.IsTrue(zipWithIndex.Contains(new KeyValuePair<string, long>("d", 3)));
Assert.IsTrue(zipWithIndex.Contains(new Tuple<string, long>("a", 0)));
Assert.IsTrue(zipWithIndex.Contains(new Tuple<string, long>("b", 1)));
Assert.IsTrue(zipWithIndex.Contains(new Tuple<string, long>("c", 2)));
Assert.IsTrue(zipWithIndex.Contains(new Tuple<string, long>("d", 3)));
}
}
@ -374,11 +378,11 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(zipWithUniqueId.Contains(new KeyValuePair<string, long>("a", 0)));
Assert.IsTrue(zipWithUniqueId.Contains(new KeyValuePair<string, long>("b", 1)));
Assert.IsTrue(zipWithUniqueId.Contains(new KeyValuePair<string, long>("c", 4)));
Assert.IsTrue(zipWithUniqueId.Contains(new KeyValuePair<string, long>("d", 2)));
Assert.IsTrue(zipWithUniqueId.Contains(new KeyValuePair<string, long>("e", 5)));
Assert.IsTrue(zipWithUniqueId.Contains(new Tuple<string, long>("a", 0)));
Assert.IsTrue(zipWithUniqueId.Contains(new Tuple<string, long>("b", 1)));
Assert.IsTrue(zipWithUniqueId.Contains(new Tuple<string, long>("c", 4)));
Assert.IsTrue(zipWithUniqueId.Contains(new Tuple<string, long>("d", 2)));
Assert.IsTrue(zipWithUniqueId.Contains(new Tuple<string, long>("e", 5)));
}
}
@ -530,22 +534,22 @@ namespace Microsoft.Spark.CSharp.Samples
var words = lines.FlatMap(s => s.Split(' '));
var wordCounts = words.Map(w => new KeyValuePair<string, int>(w.Trim(), 1))
var wordCounts = words.Map(w => new Tuple<string, int>(w.Trim(), 1))
.ReduceByKey((x, y) => x + y).Collect();
Console.WriteLine("*** Printing words and their counts ***");
foreach (var kvp in wordCounts)
{
Console.WriteLine("'{0}':{1}", kvp.Key, kvp.Value);
Console.WriteLine("'{0}':{1}", kvp.Item1, kvp.Item2);
}
var wordCountsCaseInsensitve = words.Map(w => new KeyValuePair<string, int>(w.ToLower().Trim(), 1))
var wordCountsCaseInsensitve = words.Map(w => new Tuple<string, int>(w.ToLower().Trim(), 1))
.ReduceByKey((x, y) => x + y).Collect();
Console.WriteLine("*** Printing words and their counts ignoring case ***");
foreach (var kvp in wordCountsCaseInsensitve)
{
Console.WriteLine("'{0}':{1}", kvp.Key, kvp.Value);
Console.WriteLine("'{0}':{1}", kvp.Item1, kvp.Item2);
}
if (SparkCLRSamples.Configuration.IsValidationEnabled)
@ -553,7 +557,7 @@ namespace Microsoft.Spark.CSharp.Samples
var dictionary = new Dictionary<string, int>();
foreach (var kvp in wordCounts)
{
dictionary[kvp.Key] = kvp.Value;
dictionary[kvp.Item1] = kvp.Item2;
}
Assert.AreEqual(22, dictionary["the"]);
@ -563,7 +567,7 @@ namespace Microsoft.Spark.CSharp.Samples
var caseInsenstiveWordCountDictionary = new Dictionary<string, int>();
foreach (var kvp in wordCountsCaseInsensitve)
{
caseInsenstiveWordCountDictionary[kvp.Key] = kvp.Value;
caseInsenstiveWordCountDictionary[kvp.Item1] = kvp.Item2;
}
Assert.AreEqual(45, caseInsenstiveWordCountDictionary["the"]);
@ -584,12 +588,12 @@ namespace Microsoft.Spark.CSharp.Samples
var requestsColumns = requests.Map(s =>
{
var columns = s.Split(',');
return new KeyValuePair<string, string[]>(columns[0], new[] { columns[1], columns[2], columns[3] });
return new Tuple<string, string[]>(columns[0], new[] { columns[1], columns[2], columns[3] });
});
var metricsColumns = metrics.Map(s =>
{
var columns = s.Split(',');
return new KeyValuePair<string, string[]>(columns[3], new[] { columns[4], columns[5], columns[6] });
return new Tuple<string, string[]>(columns[3], new[] { columns[4], columns[5], columns[6] });
});
var requestsJoinedWithMetrics = requestsColumns.Join(metricsColumns)
@ -597,29 +601,29 @@ namespace Microsoft.Spark.CSharp.Samples
s =>
new []
{
s.Key, //guid
s.Value.Item1[0], s.Value.Item1[1], s.Value.Item1[2], //dc, abtestid, traffictype
s.Value.Item2[0],s.Value.Item2[1], s.Value.Item2[2] //lang, country, metric
s.Item1, //guid
s.Item2.Item1[0], s.Item2.Item1[1], s.Item2.Item1[2], //dc, abtestid, traffictype
s.Item2.Item2[0],s.Item2.Item2[1], s.Item2.Item2[2] //lang, country, metric
});
var latencyByDatacenter = requestsJoinedWithMetrics.Map(i => new KeyValuePair<string, int> (i[1], int.Parse(i[6]))); //key is "datacenter"
var latencyByDatacenter = requestsJoinedWithMetrics.Map(i => new Tuple<string, int> (i[1], int.Parse(i[6]))); //key is "datacenter"
var maxLatencyByDataCenterList = latencyByDatacenter.ReduceByKey(Math.Max).Collect();
Console.WriteLine("***** Max latency metrics by DC *****");
foreach (var keyValuePair in maxLatencyByDataCenterList)
foreach (var Tuple in maxLatencyByDataCenterList)
{
Console.WriteLine("Datacenter={0}, Max latency={1}", keyValuePair.Key, keyValuePair.Value);
Console.WriteLine("Datacenter={0}, Max latency={1}", Tuple.Item1, Tuple.Item2);
}
var latencyAndCountByDatacenter = requestsJoinedWithMetrics.Map(i => new KeyValuePair<string, Tuple<int,int>> (i[1], new Tuple<int, int>(int.Parse(i[6]), 1)));
var latencyAndCountByDatacenter = requestsJoinedWithMetrics.Map(i => new Tuple<string, Tuple<int,int>> (i[1], new Tuple<int, int>(int.Parse(i[6]), 1)));
var sumLatencyAndCountByDatacenter = latencyAndCountByDatacenter.ReduceByKey((tuple, tuple1) => new Tuple<int, int>((tuple == null ? 0 : tuple.Item1) + tuple1.Item1, (tuple == null ? 0 : tuple.Item2) + tuple1.Item2));
var sumLatencyAndCountByDatacenterList = sumLatencyAndCountByDatacenter.Collect();
Console.WriteLine("***** Mean latency metrics by DC *****");
foreach (var keyValuePair in sumLatencyAndCountByDatacenterList)
foreach (var Tuple in sumLatencyAndCountByDatacenterList)
{
Console.WriteLine("Datacenter={0}, Mean latency={1}", keyValuePair.Key, keyValuePair.Value.Item1/keyValuePair.Value.Item2);
Console.WriteLine("Datacenter={0}, Mean latency={1}", Tuple.Item1, Tuple.Item2.Item1/Tuple.Item2.Item2);
}
if (SparkCLRSamples.Configuration.IsValidationEnabled)
@ -627,7 +631,7 @@ namespace Microsoft.Spark.CSharp.Samples
var dictionary = new Dictionary<string, int>();
foreach (var kvp in maxLatencyByDataCenterList)
{
dictionary[kvp.Key] = kvp.Value;
dictionary[kvp.Item1] = kvp.Item2;
}
Assert.AreEqual(835, dictionary["iowa"]);
@ -636,7 +640,7 @@ namespace Microsoft.Spark.CSharp.Samples
var meanDictionary = new Dictionary<string, Tuple<int, int>>();
foreach (var kvp in sumLatencyAndCountByDatacenterList)
{
meanDictionary[kvp.Key] = new Tuple<int, int>(kvp.Value.Item1, kvp.Value.Item2);
meanDictionary[kvp.Item1] = new Tuple<int, int>(kvp.Item2.Item1, kvp.Item2.Item2);
}
Assert.AreEqual(1621, meanDictionary["iowa"].Item1);
@ -737,7 +741,7 @@ namespace Microsoft.Spark.CSharp.Samples
var markets = SparkCLRSamples.SparkContext.TextFile(SparkCLRSamples.Configuration.GetInputDataPath("market.tab"), 1);
long totalMarketsCount = markets.Count();
var marketsByKey = markets.Map(x => new KeyValuePair<string, string>(x.Substring(0, x.IndexOf('-')), x));
var marketsByKey = markets.Map(x => new Tuple<string, string>(x.Substring(0, x.IndexOf('-')), x));
var categories = marketsByKey.PartitionBy(2)
.CombineByKey(() => "", (c, v) => v.Substring(0, v.IndexOf('-')), (c1, c2) => c1, 2);
var categoriesCollectedCount = categories.Collect().Count();

@ -185,5 +185,31 @@ namespace Microsoft.Spark.CSharp.Samples
Assert.AreEqual(schemaPeople.Json, dataFramePeople.Schema.Json);
}
}
[Sample]
internal static void SparkSessionUdfSample()
{
GetSparkSession().Udf.RegisterFunction<string, string, string>("FullAddress", (city, state) => city + " " + state);
GetSparkSession().Udf.RegisterFunction<bool, string, int>("PeopleFilter", (name, age) => name == "Bill" && age > 80);
var peopleDataFrame = GetSparkSession().Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(DataFrameSamples.PeopleJson));
var functionAppliedDF = peopleDataFrame.SelectExpr("name", "age * 2 as age",
"FullAddress(address.city, address.state) as address")
.Where("PeopleFilter(name, age)");
functionAppliedDF.ShowSchema();
functionAppliedDF.Show();
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
var collected = functionAppliedDF.Collect().ToArray();
CollectionAssert.AreEquivalent(new[] { "name", "age", "address" },
functionAppliedDF.Schema.Fields.Select(f => f.Name).ToArray());
Assert.AreEqual(1, collected.Length);
Assert.AreEqual("Bill", collected[0].Get("name"));
Assert.AreEqual(86, collected[0].Get("age"));
Assert.AreEqual("Seattle Washington", collected[0].Get("address"));
}
}
}
}
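One detail worth noting in the UDF sample above: in `RegisterFunction` the first generic parameter is the return type and the remaining ones are the argument types, which is why the boolean filter is registered as `<bool, string, int>`. A minimal sketch reusing the sample's `GetSparkSession()` helper (the UDF name and lambda below are hypothetical and not part of this change):

```c#
// Return type first, then argument types: a (string, int) -> string function registers as <string, string, int>.
GetSparkSession().Udf.RegisterFunction<string, string, int>(
    "NameWithAge",                             // hypothetical UDF name
    (name, age) => name + " (" + age + ")");   // string name, int age -> string
```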


@ -533,7 +533,7 @@ namespace Microsoft.Spark.CSharp
.GetField("value", BindingFlags.NonPublic | BindingFlags.Instance)
.GetValue(item.Value);
logger.LogDebug("({0}, {1})", item.Key, value);
formatter.Serialize(ms, new KeyValuePair<int, dynamic>(item.Key, value));
formatter.Serialize(ms, new Tuple<int, dynamic>(item.Key, value));
byte[] buffer = ms.ToArray();
SerDe.Write(networkStream, buffer.Length);
SerDe.Write(networkStream, buffer);
@ -649,7 +649,7 @@ namespace Microsoft.Spark.CSharp
}
watch.Stop();
yield return new KeyValuePair<byte[], byte[]>(pairKey, pairValue);
yield return new Tuple<byte[], byte[]>(pairKey, pairValue);
break;
}


@ -573,7 +573,7 @@ namespace WorkerTest
{
WritePayloadHeaderToWorker(s);
byte[] command = SparkContext.BuildCommand(
new CSharpWorkerFunc((pid, iter) => iter.Cast<KeyValuePair<byte[], byte[]>>().Select(pair => pair.Key)),
new CSharpWorkerFunc((pid, iter) => iter.Cast<Tuple<byte[], byte[]>>().Select(pair => pair.Item1)),
SerializedMode.Pair, SerializedMode.None);
SerDe.Write(s, command.Length);
@ -713,7 +713,7 @@ namespace WorkerTest
/// <summary>
/// read accumulator
/// </summary>
private IEnumerable<KeyValuePair<int, dynamic>> ReadAccumulator(Stream s, int expectedCount = 0)
private IEnumerable<Tuple<int, dynamic>> ReadAccumulator(Stream s, int expectedCount = 0)
{
int count = 0;
var formatter = new BinaryFormatter();
@ -723,7 +723,7 @@ namespace WorkerTest
if (length > 0)
{
var ms = new MemoryStream(SerDe.ReadBytes(s, length));
yield return (KeyValuePair<int, dynamic>)formatter.Deserialize(ms);
yield return (Tuple<int, dynamic>)formatter.Deserialize(ms);
if (expectedCount > 0 && ++count >= expectedCount)
{
@ -780,8 +780,8 @@ namespace WorkerTest
int accumulatorsCount = SerDe.ReadInt(s);
Assert.IsTrue(accumulatorsCount == 1);
var accumulatorFromWorker = ReadAccumulator(s, accumulatorsCount).First();
Assert.AreEqual(accumulatorId, accumulatorFromWorker.Key);
Assert.AreEqual(expectedCount, accumulatorFromWorker.Value);
Assert.AreEqual(accumulatorId, accumulatorFromWorker.Item1);
Assert.AreEqual(expectedCount, accumulatorFromWorker.Item2);
SerDe.ReadInt(s);
}
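The Worker and WorkerTest hunks above are the two ends of one exchange: the worker serializes each accumulator update as a `Tuple<int, dynamic>` with `BinaryFormatter` and writes it with a length prefix, and the test reads the length, then the payload, then deserializes. The following is a self-contained sketch of that length-prefixed framing pattern; it uses an in-memory stream and `BinaryWriter`/`BinaryReader` purely for illustration (the repo's `SerDe` helpers own the actual wire format), and `Tuple<int, object>` stands in for the worker's `Tuple<int, dynamic>`:

```c#
using System;
using System.IO;
using System.Runtime.Serialization.Formatters.Binary;

class AccumulatorFramingSketch
{
    static void Main()
    {
        var formatter = new BinaryFormatter();
        using (var channel = new MemoryStream())
        {
            // Writer side: serialize the (accumulatorId, value) pair, then frame it with a length prefix.
            byte[] payload;
            using (var ms = new MemoryStream())
            {
                formatter.Serialize(ms, new Tuple<int, object>(1, 42));
                payload = ms.ToArray();
            }
            var writer = new BinaryWriter(channel);
            writer.Write(payload.Length);
            writer.Write(payload);
            writer.Flush();

            // Reader side: read the prefix, read exactly that many bytes, then deserialize.
            channel.Position = 0;
            var reader = new BinaryReader(channel);
            int length = reader.ReadInt32();
            byte[] bytes = reader.ReadBytes(length);
            var pair = (Tuple<int, object>)formatter.Deserialize(new MemoryStream(bytes));
            Console.WriteLine("accumulator {0} = {1}", pair.Item1, pair.Item2);
        }
    }
}
```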


@ -32,17 +32,17 @@
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<Reference Include="CSharpWorker, Version=1.5.2.0, Culture=neutral, processorArchitecture=MSIL">
<Reference Include="CSharpWorker">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
</Reference>
<Reference Include="log4net, Version=1.2.15.0, Culture=neutral, PublicKeyToken=669e0ddf0bb1aa2a, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter, Version=1.6.1.0, Culture=neutral, processorArchitecture=MSIL">
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
</Reference>
<Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
@ -67,7 +67,7 @@
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe.config">
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe.config">
<Link>CSharpWorker.exe.config</Link>
</None>
<None Include="..\..\App.config">


@ -4,5 +4,5 @@
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
</packages>


@ -35,17 +35,17 @@
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<Reference Include="CSharpWorker, Version=1.5.2.0, Culture=neutral, processorArchitecture=MSIL">
<Reference Include="CSharpWorker">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
</Reference>
<Reference Include="log4net, Version=1.2.15.0, Culture=neutral, PublicKeyToken=669e0ddf0bb1aa2a, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter, Version=1.6.1.0, Culture=neutral, processorArchitecture=MSIL">
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
</Reference>
<Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
@ -66,7 +66,7 @@
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe.config">
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe.config">
<Link>CSharpWorker.exe.config</Link>
</None>
<None Include="..\..\App.config">


@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="log4net" version="2.0.5" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />


@ -35,13 +35,13 @@
</PropertyGroup>
<ItemGroup>
<Reference Include="CSharpWorker">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
</Reference>
<Reference Include="log4net">
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
</Reference>
<Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>


@ -4,5 +4,5 @@
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
</packages>


@ -37,12 +37,12 @@
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
</Reference>
<Reference Include="CSharpWorker, Version=1.6.1.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<Reference Include="CSharpWorker">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter, Version=1.6.1.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="System" />


@ -1,4 +1,4 @@
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
@ -20,21 +20,40 @@ namespace Microsoft.Spark.CSharp.Examples
{
LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); //this is optional - DefaultLoggerService will be used if not set
var logger = LoggerServiceFactory.GetLogger(typeof(HiveDataFrameExample));
var sparkConf = new SparkConf();
var sparkContext = new SparkContext(sparkConf);
var hiveContext = new HiveContext(sparkContext);
// please give the path to input json file
var jsonFilePath = args[0];
var peopleDataFrame = hiveContext.Read().Json(jsonFilePath);
const string dbName = "SampleHiveDataBaseForMobius";
const string tableName = "people";
var builder = SparkSession.Builder().EnableHiveSupport();
// The following setting is required to use Spark 2.0 in Windows
// It may be provided in command line when running Mobius app
//builder = builder.Config("spark.sql.warehouse.dir", "<hdfs or local path>");
var session = builder.GetOrCreate();
var peopleDataFrame = session.Read().Json(jsonFilePath);
session.Sql(string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName)); // create database if not exists
session.Sql(string.Format("USE {0}", dbName));
//hiveContext.Sql(string.Format("DROP TABLE {0}", tableName)); // drop table if exists
peopleDataFrame.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName); // create table
var tablesDataFrame = session.Table(tableName); // get the table contents as a DataFrame
logger.LogInfo(string.Format("row count in table {0}: {1}", tableName, tablesDataFrame.Count()));
tablesDataFrame.Show();
session.Sql(string.Format("SELECT * FROM {0}", tableName)).Show(); // select from table
// Following example is for the deprecated API
/*
var sparkConf = new SparkConf();
// The following setting is required to use Spark 2.0 in Windows
// It may be provided in command line when running Mobius app
//sparkConf.Set("spark.sql.warehouse.dir", @"<hdfs or local path>");
var sparkContext = new SparkContext(sparkConf);
var hiveContext = new HiveContext(sparkContext);
var peopleDataFrame = hiveContext.Read().Json(jsonFilePath);
hiveContext.Sql(string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName)); // create database if not exists
hiveContext.Sql(string.Format("USE {0}", dbName));
hiveContext.Sql(string.Format("DROP TABLE {0}", tableName)); // drop table if exists
//hiveContext.Sql(string.Format("DROP TABLE {0}", tableName)); // drop table if exists
peopleDataFrame.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName); // create table
var tablesDataFrame = hiveContext.Tables(dbName); // get all tables in database
@ -42,6 +61,7 @@ namespace Microsoft.Spark.CSharp.Examples
tablesDataFrame.Show();
hiveContext.Sql(string.Format("SELECT * FROM {0}", tableName)).Show(); // select from table
*/
}
}
}


@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="log4net" version="2.0.5" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net452" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net452" />
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />


@ -34,17 +34,17 @@
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<Reference Include="CSharpWorker, Version=1.5.2.0, Culture=neutral, processorArchitecture=MSIL">
<Reference Include="CSharpWorker">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
</Reference>
<Reference Include="log4net, Version=1.2.15.0, Culture=neutral, PublicKeyToken=669e0ddf0bb1aa2a, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter, Version=1.5.2.0, Culture=neutral, processorArchitecture=MSIL">
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
</Reference>
<Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
@ -65,7 +65,7 @@
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe.config">
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe.config">
<Link>CSharpWorker.exe.config</Link>
</None>
<None Include="..\..\App.config">


@ -4,5 +4,5 @@
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
</packages>


@ -34,17 +34,17 @@
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<Reference Include="CSharpWorker, Version=1.5.2.0, Culture=neutral, processorArchitecture=MSIL">
<Reference Include="CSharpWorker">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
</Reference>
<Reference Include="log4net, Version=1.2.15.0, Culture=neutral, PublicKeyToken=669e0ddf0bb1aa2a, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter, Version=1.5.2.0, Culture=neutral, processorArchitecture=MSIL">
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
</Reference>
<Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
@ -65,7 +65,7 @@
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe.config">
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe.config">
<Link>CSharpWorker.exe.config</Link>
</None>
<None Include="..\..\App.config">


@ -4,5 +4,5 @@
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
</packages>


@ -35,7 +35,7 @@
</PropertyGroup>
<ItemGroup>
<Reference Include="CSharpWorker">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
</Reference>
<Reference Include="log4net, Version=1.2.15.0, Culture=neutral, PublicKeyToken=669e0ddf0bb1aa2a, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
@ -43,7 +43,7 @@
</Reference>
<Reference Include="Microsoft.CSharp" />
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
</Reference>
<Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
@ -68,7 +68,7 @@
<Compile Include="EventPublisher.cs" />
</ItemGroup>
<ItemGroup>
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe.config">
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe.config">
<Link>CSharpWorker.exe.config</Link>
</None>
<None Include="..\..\App.config">


@ -4,7 +4,7 @@
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
<!-- *** ****************************************************************** *** -->
<!-- *** Following references are needed for publishing events to EventHubs *** -->


@ -38,7 +38,7 @@
</Reference>
<Reference Include="CSharpWorker">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
</Reference>
<Reference Include="log4net">
<SpecificVersion>False</SpecificVersion>
@ -46,7 +46,7 @@
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
</Reference>
<Reference Include="Razorvine.Pyrolite">
<SpecificVersion>False</SpecificVersion>
@ -64,7 +64,7 @@
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe.config">
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe.config">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Include="..\..\App.config">


@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="log4net" version="2.0.5" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />


@ -33,14 +33,14 @@
</PropertyGroup>
<ItemGroup>
<Reference Include="CSharpWorker">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
</Reference>
<Reference Include="log4net, Version=1.2.10.0, Culture=neutral, PublicKeyToken=1b44e1d426115821, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
</Reference>
<Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>


@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="log4net" version="2.0.5" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />


@ -66,13 +66,13 @@
</ItemGroup>
<ItemGroup>
<Reference Include="CSharpWorker">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
</Reference>
<Reference Include="log4net">
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
</Reference>
<Reference Include="mscorlib" />
<Reference Include="FSharp.Core, Version=$(TargetFSharpCoreVersion), Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a">


@ -2,7 +2,7 @@
<packages>
<package id="FSharp.Core" version="4.0.0.1" targetFramework="net45" />
<package id="log4net" version="2.0.5" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />


@ -71,7 +71,7 @@
</ItemGroup>
<ItemGroup>
<Reference Include="CSharpWorker">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="FSharp.Core">
@ -83,7 +83,7 @@
<Private>True</Private>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="mscorlib" />


@ -2,7 +2,7 @@
<packages>
<package id="FSharp.Core" version="4.0.0.1" targetFramework="net45" />
<package id="log4net" version="2.0.5" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />


@ -6,7 +6,8 @@
* Maven 3.0.5 or above.
* Mono 4.2 stable or above. The download and installation instructions for Mono are available in [http://www.mono-project.com/download/#download-lin](http://www.mono-project.com/download/#download-lin) (see [Debian, Ubuntu and derivatives](http://www.mono-project.com/docs/getting-started/install/linux/#debian-ubuntu-and-derivatives) or [CentOS, Fedora, similar Linux distributions or OS X](http://www.mono-project.com/docs/getting-started/install/linux/#centos-7-fedora-19-and-later-and-derivatives))
* F# for Mono. The download and installation instructions for the F# Mono extension are available in [http://fsharp.org/use/linux/](http://fsharp.org/use/linux/)
* NuGet.
* NuGet
* wget
* XSLTPROC
The following environment variables should be set properly:
@ -15,9 +16,10 @@ The following environment variables should be set properly:
## Instructions
Instructions to build Mobius in Linux are same as [instructions for Windows](./windows-instructions.md#instructions). The only change required is to use the following script files instead of .cmd files:
* build.sh
* clean.sh
Instructions to build Mobius on Linux are the same as the [instructions for Windows](./windows-instructions.md#instructions). The only change required is to use the following script files instead of `.cmd` files:
* `build.sh`
* `clean.sh`
# Running Unit Tests in Linux

notes/mobius-init.fsx Normal file

@ -0,0 +1,31 @@
// *** Replace the paths below to point to the correct location of the Mobius binaries ***
#r @"C:\spark-clr_2.11-2.0.000\runtime\bin\Microsoft.Spark.CSharp.Adapter.dll"
#r @"C:\spark-clr_2.11-2.0.000\runtime\bin\log4net.dll"
#r @"C:\spark-clr_2.11-2.0.000\runtime\bin\Newtonsoft.Json.dll"
#r @"C:\spark-clr_2.11-2.0.000\runtime\bin\Razorvine.Pyrolite.dll"
#r @"C:\spark-clr_2.11-2.0.000\runtime\bin\Razorvine.Serpent.dll"
#r @"C:\spark-clr_2.11-2.0.000\runtime\bin\CSharpWorker.exe"
open Microsoft.Spark.CSharp.Core
open Microsoft.Spark.CSharp.Services
open Microsoft.Spark.CSharp.Sql
open System.Reflection
open System.Collections.Generic
LoggerServiceFactory.SetLoggerService Log4NetLoggerService.Instance
// *** Uncomment & use the following code block to use SqlContext API ***
//let conf = SparkConf().SetAppName "FSharpInteractiveShell"
// *** uncomment & update master URL if running in non-local mode ***
//conf.Master "spark://sparkmaster:7077"
// *** Spark 2.0 in Windows requires the following config ***
//conf.Set("spark.sql.warehouse.dir", @"file:///C:/sparktemp")
//let sc = SparkContext conf
//let sqlContext = SqlContext sc
// *** Uncomment & use the following code block to use SparkSession API ***
let builder = SparkSession.Builder()
builder = builder.AppName "FSharpInteractiveShell"
// *** uncomment & update master URL if running in non-local mode ***
//builder = builder.Master "spark://sparkmaster:7077"
// *** Spark 2.0 in Windows requires the following config ***
builder = builder.Config("spark.sql.warehouse.dir", "file:///C:/sparktemp")
let session = builder.GetOrCreate()


@ -0,0 +1,17 @@
# Implementing Spark Apps in F# using Mobius
## Non-Interactive Apps
1. Develop your application in an F# IDE using the Mobius API. Refer to [F# examples](../examples/fsharp) for sample code.
2. Use [`sparkclr-submit.cmd`](running-mobius-app.md) to run your Mobius-based Spark application implemented in F#.
## Interactive Apps
### Using F# Interactive (fsi.exe)
1. Run `sparkclr-submit.cmd debug` in a command prompt after setting the necessary [environment variables](running-mobius-app.md#pre-requisites). Note that the `debug` parameter is a misnomer in this context; the command initializes the .NET-JVM bridge, similar to [running Mobius apps in debug mode](./running-mobius-app.md#debug-mode).
2. In the Developer Command Prompt for VS, run `fsi.exe --use:c:\temp\mobius-init.fsx`. [mobius-init.fsx](mobius-init.fsx) contains the initialization code used to create `SparkContext`, `SqlContext` or `SparkSession`. Update the location of the Mobius binaries referenced at the beginning of the script file; you may also need to update other configuration settings in the script.
3. When the F# prompt is available, Spark functionality can be invoked using the Mobius API. For example, the following code can be used to process a JSON file.
```fsharp
let dataframe = session.Read().Json @"C:\temp\data.json";;
dataframe.Show();;
dataframe.ShowSchema();;
dataframe.Count();;
```