merging latest changes from Microsoft/master
This commit is contained in:
Commit e14b92ba43
@@ -2,6 +2,8 @@ language: csharp
solution: csharp/SparkCLR.sln
sudo: required
dist: trusty
env:
- JDK=openjdk7
before_install:
- sudo apt-get install xsltproc
- nuget install NUnit.Runners -Version 3.0.0 -OutputDirectory testrunner

@@ -12,6 +14,8 @@ before_install:
- export M2="$M2_HOME/bin"
- export PATH="$M2:$PATH"
- hash -r
before_script:
- jdk_switcher use $JDK
script:
- export MAVEN_OPTS="-XX:MaxPermSize=2g -Xmx4g"
- export JAVA_OPTS="-XX:MaxPermSize=2g -Xmx4g"
@@ -8,7 +8,7 @@ For example, the word count sample in Apache Spark can be implemented in C# as f
```c#
var lines = sparkContext.TextFile(@"hdfs://path/to/input.txt");
var words = lines.FlatMap(s => s.Split(' '));
var wordCounts = words.Map(w => new KeyValuePair<string, int>(w.Trim(), 1))
var wordCounts = words.Map(w => new Tuple<string, int>(w.Trim(), 1))
                      .ReduceByKey((x, y) => x + y);
var wordCountCollection = wordCounts.Collect();
wordCounts.SaveAsTextFile(@"hdfs://path/to/wordcount.txt");
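With this change the collected word counts are `Tuple<string, int>` elements rather than `KeyValuePair<string, int>`. A minimal sketch of consuming the collected results from the snippet above (the `wordCountCollection` variable comes from that snippet; the loop itself is illustrative and not part of the commit):

```c#
// Illustrative only: with KeyValuePair the accessors were .Key/.Value;
// with Tuple they are .Item1/.Item2.
foreach (var wordCount in wordCountCollection)
{
    Console.WriteLine("{0}: {1}", wordCount.Item1, wordCount.Item2);
}
```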
@ -63,7 +63,7 @@ StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpoint
|
|||
.Map(kvp => Encoding.UTF8.GetString(kvp.Value))
|
||||
.Filter(line => line.Contains(","))
|
||||
.Map(line => line.Split(','))
|
||||
.Map(columns => new KeyValuePair<string, int>(
|
||||
.Map(columns => new Tuple<string, int>(
|
||||
string.Format("{0},{1}", columns[0], columns[1]), 1))
|
||||
.ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y,
|
||||
windowDurationInSecs, slideDurationInSecs, 3)
|
||||
|
@ -119,6 +119,7 @@ Refer to the [docs folder](docs) for design overview and other info on Mobius
|
|||
* [Configuration parameters in Mobius](./notes/configuration-mobius.md)
|
||||
* [Troubleshoot errors in Mobius](./notes/troubleshooting-mobius.md)
|
||||
* [Debug Mobius apps](./notes/running-mobius-app.md#debug-mode)
|
||||
* [Implementing Spark Apps in F# using Mobius](./notes/spark-fsharp-mobius.md)
|
||||
|
||||
## Supported Spark Versions
|
||||
|
||||
|
|
|
@@ -49,7 +49,8 @@ if "%precheck%" == "bad" (goto :EOF)
@rem
set SPARK_VERSION=2.0.0
set HADOOP_VERSION=2.6
@echo [RunSamples.cmd] SPARK_VERSION=%SPARK_VERSION%, HADOOP_VERSION=%HADOOP_VERSION%
set APACHE_DIST_SERVER=archive.apache.org
@echo [RunSamples.cmd] SPARK_VERSION=%SPARK_VERSION%, HADOOP_VERSION=%HADOOP_VERSION%, APACHE_DIST_SERVER=%APACHE_DIST_SERVER%

@rem download runtime dependencies
pushd "%CMDHOME%"

@@ -10,6 +10,9 @@
#
Param([string] $stage, [string] $verbose)

$envValue = [Environment]::GetEnvironmentVariable("APACHE_DIST_SERVER")
$apacheDistServer = if ($envValue -eq $null) { "archive.apache.org" } else { $envValue }

if ($stage.ToLower() -eq "run")
{
    # retrieve hadoop and spark versions from environment variables

@@ -19,7 +22,7 @@ if ($stage.ToLower() -eq "run")
    $envValue = [Environment]::GetEnvironmentVariable("SPARK_VERSION")
    $sparkVersion = if ($envValue -eq $null) { "1.6.1" } else { $envValue }

    Write-Output "[downloadtools] hadoopVersion=$hadoopVersion, sparkVersion=$sparkVersion"
    Write-Output "[downloadtools] hadoopVersion=$hadoopVersion, sparkVersion=$sparkVersion, apacheDistServer=$apacheDistServer"
}

function Get-ScriptDirectory

@@ -72,10 +75,18 @@ function Download-File($url, $output)
{
    $output = [System.IO.Path]::GetFullPath($output)
    if (test-path $output)
    {
        if ((Get-Item $output).Length -gt 0)
        {
            Write-Output "[downloadtools.Download-File] $output exists. No need to download."
            return
        }
        else
        {
            Write-Output "[downloadtools.Download-File] [WARNING] $output exists but is empty. We need to download a new copy of the file."
            Remove-Item $output
        }
    }

    $start_time = Get-Date
    $wc = New-Object System.Net.WebClient

@@ -122,6 +133,11 @@ function Download-File($url, $output)
    }

    Write-Output "[downloadtools.Download-File] Download completed. Time taken: $howlong"

    if ( !(test-path $output) -or (Get-Item $output).Length -eq 0)
    {
        throw [System.IO.FileNotFoundException] "Failed to download file $output from $url"
    }
}

function Unzip-File($zipFile, $targetDir)

@@ -252,7 +268,7 @@ function Download-BuildTools
    $mvnCmd = "$toolsDir\$mvnVer\bin\mvn.cmd"
    if (!(test-path $mvnCmd))
    {
        $url = "http://www.us.apache.org/dist/maven/maven-3/3.3.9/binaries/$mvnVer-bin.tar.gz"
        $url = "http://$apacheDistServer/dist/maven/maven-3/3.3.9/binaries/$mvnVer-bin.tar.gz"
        $output="$toolsDir\$mvnVer-bin.tar.gz"
        Download-File $url $output
        Untar-File $output $toolsDir

@@ -402,7 +418,7 @@ function Download-RuntimeDependencies
    $sparkSubmit="$S_HOME\bin\spark-submit.cmd"
    if (!(test-path $sparkSubmit))
    {
        $url = "http://www.us.apache.org/dist/spark/spark-$sparkVersion/spark-$sparkVersion-bin-hadoop$hadoopVersion.tgz"
        $url = "http://$apacheDistServer/dist/spark/spark-$sparkVersion/spark-$sparkVersion-bin-hadoop$hadoopVersion.tgz"
        $output = "$toolsDir\spark-$sparkVersion-bin-hadoop$hadoopVersion.tgz"
        Download-File $url $output
        Untar-File $output $toolsDir
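For readers following the Download-File changes above: the function now skips files that already exist and are non-empty, deletes and re-downloads empty ones, and throws when the downloaded file is still missing or empty. A rough C# sketch of the same download-and-verify pattern, for illustration only (the method name and use of WebClient are assumptions, not part of the commit):

```c#
using System;
using System.IO;
using System.Net;

static class Downloader
{
    public static void DownloadFileChecked(string url, string output)
    {
        // Skip the download when a non-empty copy already exists; delete empty leftovers.
        if (File.Exists(output))
        {
            if (new FileInfo(output).Length > 0) return;
            File.Delete(output);
        }

        using (var wc = new WebClient())
        {
            wc.DownloadFile(url, output);
        }

        // Fail loudly if the download produced nothing, mirroring the PowerShell check.
        if (!File.Exists(output) || new FileInfo(output).Length == 0)
            throw new FileNotFoundException("Failed to download file " + output + " from " + url);
    }
}
```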
@@ -18,7 +18,8 @@ done
# setup Hadoop and Spark versions
export SPARK_VERSION=2.0.0
export HADOOP_VERSION=2.6
echo "[run-samples.sh] SPARK_VERSION=$SPARK_VERSION, HADOOP_VERSION=$HADOOP_VERSION"
export APACHE_DIST_SERVER=archive.apache.org
echo "[run-samples.sh] SPARK_VERSION=$SPARK_VERSION, HADOOP_VERSION=$HADOOP_VERSION, APACHE_DIST_SERVER=$APACHE_DIST_SERVER"

export FWDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

@@ -30,7 +31,7 @@ export SPARK=spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION
export SPARK_HOME="$TOOLS_DIR/$SPARK"
if [ ! -d "$SPARK_HOME" ];
then
    wget "http://www.us.apache.org/dist/spark/spark-$SPARK_VERSION/$SPARK.tgz" -O "$TOOLS_DIR/$SPARK.tgz"
    wget "http://$APACHE_DIST_SERVER/dist/spark/spark-$SPARK_VERSION/$SPARK.tgz" -O "$TOOLS_DIR/$SPARK.tgz"
    tar xfz "$TOOLS_DIR/$SPARK.tgz" -C "$TOOLS_DIR"
fi
export PATH="$SPARK_HOME/bin:$PATH"

@@ -157,6 +157,7 @@
    <Compile Include="Sql\SparkSession.cs" />
    <Compile Include="Sql\SqlContext.cs" />
    <Compile Include="Sql\Types.cs" />
    <Compile Include="Sql\UdfRegistration.cs" />
    <Compile Include="Sql\UserDefinedFunction.cs" />
    <Compile Include="Streaming\ConstantInputDStream.cs" />
    <Compile Include="Streaming\DStream.cs" />

@@ -216,19 +216,19 @@ namespace Microsoft.Spark.CSharp.Core
for (int i = 0; i < numUpdates; i++)
{
    var ms = new MemoryStream(SerDe.ReadBytes(ns));
    KeyValuePair<int, dynamic> update = (KeyValuePair<int, dynamic>)formatter.Deserialize(ms);
    var update = (Tuple<int, dynamic>)formatter.Deserialize(ms);

    if (Accumulator.accumulatorRegistry.ContainsKey(update.Key))
    if (Accumulator.accumulatorRegistry.ContainsKey(update.Item1))
    {
        Accumulator accumulator = Accumulator.accumulatorRegistry[update.Key];
        accumulator.GetType().GetMethod("Add").Invoke(accumulator, new object[] { update.Value });
        Accumulator accumulator = Accumulator.accumulatorRegistry[update.Item1];
        accumulator.GetType().GetMethod("Add").Invoke(accumulator, new object[] { update.Item2 });
    }
    else
    {
        Console.Error.WriteLine("WARN: can't find update.Key: {0} for accumulator, will create a new one", update.Key);
        Console.Error.WriteLine("WARN: can't find update.Key: {0} for accumulator, will create a new one", update.Item1);
        var genericAccumulatorType = typeof(Accumulator<>);
        var specificAccumulatorType = genericAccumulatorType.MakeGenericType(update.Value.GetType());
        Activator.CreateInstance(specificAccumulatorType, new object[] { update.Key, update.Value });
        var specificAccumulatorType = genericAccumulatorType.MakeGenericType(update.Item2.GetType());
        Activator.CreateInstance(specificAccumulatorType, new object[] { update.Item1, update.Item2 });
    }
}
ns.WriteByte((byte)1); // acknowledge byte other than -1

@ -18,7 +18,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
{
|
||||
|
||||
/// <summary>
|
||||
/// Sorts this RDD, which is assumed to consist of KeyValuePair pairs.
|
||||
/// Sorts this RDD, which is assumed to consist of Tuple pairs.
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
/// <typeparam name="V"></typeparam>
|
||||
|
@ -26,13 +26,13 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="ascending"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, V>> SortByKey<K, V>(this RDD<KeyValuePair<K, V>> self,
|
||||
public static RDD<Tuple<K, V>> SortByKey<K, V>(this RDD<Tuple<K, V>> self,
|
||||
bool ascending = true, int? numPartitions = null)
|
||||
{
|
||||
return SortByKey<K, V, K>(self, ascending, numPartitions, new DefaultSortKeyFuncHelper<K>().Execute);
|
||||
}
|
||||
/// <summary>
|
||||
/// Sorts this RDD, which is assumed to consist of KeyValuePairs. If key is type of string, case is sensitive.
|
||||
/// Sorts this RDD, which is assumed to consist of Tuples. If Item1 is of type string, the sort is case sensitive.
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
/// <typeparam name="V"></typeparam>
|
||||
|
@ -40,9 +40,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="self"></param>
|
||||
/// <param name="ascending"></param>
|
||||
/// <param name="numPartitions">Number of partitions. Each partition of the sorted RDD contains a sorted range of the elements.</param>
|
||||
/// <param name="keyFunc">RDD will sort by keyFunc(key) for every key in KeyValuePair. Must not be null.</param>
|
||||
/// <param name="keyFunc">RDD will sort by keyFunc(key) for every Item1 in Tuple. Must not be null.</param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, V>> SortByKey<K, V, U>(this RDD<KeyValuePair<K, V>> self,
|
||||
public static RDD<Tuple<K, V>> SortByKey<K, V, U>(this RDD<Tuple<K, V>> self,
|
||||
bool ascending, int? numPartitions, Func<K, U> keyFunc)
|
||||
{
|
||||
if (keyFunc == null)
|
||||
|
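The SortByKey overloads above now operate on RDD<Tuple<K, V>>. A hedged usage sketch, assuming an existing SparkContext named `sc` as in the XML doc examples in this file (illustrative only, not part of the commit):

```c#
// Sort Tuple pairs by key; the keyFunc overload sorts by keyFunc(Item1).
var pairs = sc.Parallelize(new[]
{
    new Tuple<string, int>("B", 2),
    new Tuple<string, int>("a", 1)
}, 2);

var byKey = pairs.SortByKey();                                             // ordinal (case-sensitive) order
var byLoweredKey = pairs.SortByKey(true, null, k => k.ToLowerInvariant()); // sort by a derived key
```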
@ -73,7 +73,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/* first compute the boundary of each part via sampling: we want to partition
|
||||
* the key-space into bins such that the bins have roughly the same
|
||||
* number of (key, value) pairs falling into them */
|
||||
U[] samples = self.Sample(false, fraction, 1).Map(kv => kv.Key).Collect().Select(k => keyFunc(k)).ToArray();
|
||||
U[] samples = self.Sample(false, fraction, 1).Map(kv => kv.Item1).Collect().Select(k => keyFunc(k)).ToArray();
|
||||
Array.Sort(samples, StringComparer.Ordinal); // case sensitive if key type is string
|
||||
|
||||
List<U> bounds = new List<U>();
|
||||
|
@ -103,13 +103,13 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="partitionFunc"></param>
|
||||
/// <param name="ascending"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, V>> repartitionAndSortWithinPartitions<K, V>(
|
||||
this RDD<KeyValuePair<K, V>> self,
|
||||
public static RDD<Tuple<K, V>> repartitionAndSortWithinPartitions<K, V>(
|
||||
this RDD<Tuple<K, V>> self,
|
||||
int? numPartitions = null,
|
||||
Func<K, int> partitionFunc = null,
|
||||
bool ascending = true)
|
||||
{
|
||||
return self.MapPartitionsWithIndex<KeyValuePair<K, V>>((pid, iter) => ascending ? iter.OrderBy(kv => kv.Key) : iter.OrderByDescending(kv => kv.Key));
|
||||
return self.MapPartitionsWithIndex<Tuple<K, V>>((pid, iter) => ascending ? iter.OrderBy(kv => kv.Item1) : iter.OrderByDescending(kv => kv.Item1));
|
||||
}
|
||||
|
||||
[Serializable]
|
||||
|
@ -123,22 +123,22 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
this.ascending = ascending;
|
||||
}
|
||||
|
||||
public IEnumerable<KeyValuePair<K, V>> Execute(int pid, IEnumerable<KeyValuePair<K, V>> kvs)
|
||||
public IEnumerable<Tuple<K, V>> Execute(int pid, IEnumerable<Tuple<K, V>> kvs)
|
||||
{
|
||||
IEnumerable<KeyValuePair<K, V>> ordered;
|
||||
IEnumerable<Tuple<K, V>> ordered;
|
||||
if (ascending)
|
||||
{
|
||||
if (typeof(K) == typeof(string))
|
||||
ordered = kvs.OrderBy(k => func(k.Key).ToString(), StringComparer.Ordinal);
|
||||
ordered = kvs.OrderBy(k => func(k.Item1).ToString(), StringComparer.Ordinal);
|
||||
else
|
||||
ordered = kvs.OrderBy(k => func(k.Key));
|
||||
ordered = kvs.OrderBy(k => func(k.Item1));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (typeof(K) == typeof(string))
|
||||
ordered = kvs.OrderByDescending(k => func(k.Key).ToString(), StringComparer.Ordinal);
|
||||
ordered = kvs.OrderByDescending(k => func(k.Item1).ToString(), StringComparer.Ordinal);
|
||||
else
|
||||
ordered = kvs.OrderByDescending(k => func(k.Key));
|
||||
ordered = kvs.OrderByDescending(k => func(k.Item1));
|
||||
}
|
||||
return ordered;
|
||||
}
|
||||
@ -13,7 +13,7 @@ using Microsoft.Spark.CSharp.Interop.Ipc;
|
|||
namespace Microsoft.Spark.CSharp.Core
|
||||
{
|
||||
/// <summary>
|
||||
/// operations only available to KeyValuePair RDD
|
||||
/// operations only available to Tuple RDD
|
||||
///
|
||||
/// See also http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions
|
||||
/// </summary>
|
||||
|
@ -22,7 +22,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <summary>
|
||||
/// Return the key-value pairs in this RDD to the master as a dictionary.
|
||||
///
|
||||
/// var m = sc.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).CollectAsMap()
|
||||
/// var m = sc.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1).CollectAsMap()
|
||||
/// m[1]
|
||||
/// 2
|
||||
/// m[3]
|
||||
|
@ -33,30 +33,30 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <typeparam name="V"></typeparam>
|
||||
/// <param name="self"></param>
|
||||
/// <returns></returns>
|
||||
public static Dictionary<K, V> CollectAsMap<K, V>(this RDD<KeyValuePair<K, V>> self)
|
||||
public static IDictionary<K, V> CollectAsMap<K, V>(this RDD<Tuple<K, V>> self)
|
||||
{
|
||||
return self.Collect().ToDictionary(kv => kv.Key, kv => kv.Value);
|
||||
return self.Collect().ToDictionary(kv => kv.Item1, kv => kv.Item2);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return an RDD with the keys of each tuple.
|
||||
///
|
||||
/// >>> m = sc.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).Keys().Collect()
|
||||
/// >>> m = sc.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1).Keys().Collect()
|
||||
/// [1, 3]
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
/// <typeparam name="V"></typeparam>
|
||||
/// <param name="self"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<K> Keys<K, V>(this RDD<KeyValuePair<K, V>> self)
|
||||
public static RDD<K> Keys<K, V>(this RDD<Tuple<K, V>> self)
|
||||
{
|
||||
return self.Map<K>(kv => kv.Key);
|
||||
return self.Map<K>(kv => kv.Item1);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return an RDD with the values of each tuple.
|
||||
///
|
||||
/// >>> m = sc.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).Values().Collect()
|
||||
/// >>> m = sc.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1).Values().Collect()
|
||||
/// [2, 4]
|
||||
///
|
||||
/// </summary>
|
||||
|
@ -64,9 +64,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <typeparam name="V"></typeparam>
|
||||
/// <param name="self"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<V> Values<K, V>(this RDD<KeyValuePair<K, V>> self)
|
||||
public static RDD<V> Values<K, V>(this RDD<Tuple<K, V>> self)
|
||||
{
|
||||
return self.Map<V>(kv => kv.Value);
|
||||
return self.Map<V>(kv => kv.Item2);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -80,9 +80,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
///
|
||||
/// sc.Parallelize(new[]
|
||||
/// {
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// new Tuple<string, int>("a", 1),
|
||||
/// new Tuple<string, int>("b", 1),
|
||||
/// new Tuple<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .ReduceByKey((x, y) => x + y).Collect()
|
||||
///
|
||||
|
@ -95,9 +95,13 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="reduceFunc"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, V>> ReduceByKey<K, V>(this RDD<KeyValuePair<K, V>> self, Func<V, V, V> reduceFunc, int numPartitions = 0)
|
||||
public static RDD<Tuple<K, V>> ReduceByKey<K, V>(this RDD<Tuple<K, V>> self, Func<V, V, V> reduceFunc, int numPartitions = 0)
|
||||
{
|
||||
return CombineByKey(self, () => default(V), reduceFunc, reduceFunc, numPartitions);
|
||||
var locallyCombined = self.MapPartitionsWithIndex(new GroupByMergeHelper<K, V>(reduceFunc).Execute, true);
|
||||
|
||||
var shuffled = locallyCombined.PartitionBy(numPartitions);
|
||||
|
||||
return shuffled.MapPartitionsWithIndex(new GroupByMergeHelper<K, V>(reduceFunc).Execute, true);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
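The new ReduceByKey above performs a map-side combine: values are first merged within each partition, the partially combined pairs are shuffled with PartitionBy, and the merge runs once more per partition. A plain LINQ sketch of what each local merge step (GroupByMergeHelper) computes, for illustration only:

```c#
using System;
using System.Collections.Generic;
using System.Linq;

static class LocalCombineSketch
{
    // Merge values sharing a key within one partition, e.g. ("a",1),("a",1),("b",1) -> ("a",2),("b",1).
    public static IEnumerable<Tuple<string, int>> MergeByKey(
        IEnumerable<Tuple<string, int>> partition, Func<int, int, int> reduceFunc)
    {
        return partition.GroupBy(kv => kv.Item1, kv => kv.Item2,
                                 (k, vs) => new Tuple<string, int>(k, vs.Aggregate(reduceFunc)));
    }
}
```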
@ -109,9 +113,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
///
|
||||
/// sc.Parallelize(new[]
|
||||
/// {
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// new Tuple<string, int>("a", 1),
|
||||
/// new Tuple<string, int>("b", 1),
|
||||
/// new Tuple<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .ReduceByKeyLocally((x, y) => x + y).Collect()
|
||||
///
|
||||
|
@ -123,7 +127,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="self"></param>
|
||||
/// <param name="reduceFunc"></param>
|
||||
/// <returns></returns>
|
||||
public static Dictionary<K, V> ReduceByKeyLocally<K, V>(this RDD<KeyValuePair<K, V>> self, Func<V, V, V> reduceFunc)
|
||||
public static IDictionary<K, V> ReduceByKeyLocally<K, V>(this RDD<Tuple<K, V>> self, Func<V, V, V> reduceFunc)
|
||||
{
|
||||
return ReduceByKey(self, reduceFunc).CollectAsMap();
|
||||
}
|
||||
|
@ -133,9 +137,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
///
|
||||
/// sc.Parallelize(new[]
|
||||
/// {
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// new Tuple<string, int>("a", 1),
|
||||
/// new Tuple<string, int>("b", 1),
|
||||
/// new Tuple<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .CountByKey((x, y) => x + y).Collect()
|
||||
///
|
||||
|
@ -146,9 +150,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <typeparam name="V"></typeparam>
|
||||
/// <param name="self"></param>
|
||||
/// <returns></returns>
|
||||
public static Dictionary<K, long> CountByKey<K, V>(this RDD<KeyValuePair<K, V>> self)
|
||||
public static IEnumerable<Tuple<K, long>> CountByKey<K, V>(this RDD<Tuple<K, V>> self)
|
||||
{
|
||||
return self.MapValues(v => 1L).ReduceByKey((a, b) => a + b).CollectAsMap();
|
||||
return self.MapValues(v => 1L).ReduceByKey((a, b) => a + b).Collect();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
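CountByKey now returns IEnumerable<Tuple<K, long>> instead of a Dictionary. A minimal sketch of consuming it (assuming `sc` as above); converting back to a dictionary with LINQ is optional:

```c#
var counts = sc.Parallelize(new[]
{
    new Tuple<string, int>("a", 1),
    new Tuple<string, int>("b", 1),
    new Tuple<string, int>("a", 1)
}, 2).CountByKey();                  // [("a", 2), ("b", 1)]

var asDictionary = counts.ToDictionary(t => t.Item1, t => t.Item2);
```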
@ -159,9 +163,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// Performs a hash join across the cluster.
|
||||
///
|
||||
/// var l = sc.Parallelize(
|
||||
/// new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 1);
|
||||
/// new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 1);
|
||||
/// var r = sc.Parallelize(
|
||||
/// new[] { new KeyValuePair<string, int>("a", 2), new KeyValuePair<string, int>("a", 3) }, 1);
|
||||
/// new[] { new Tuple<string, int>("a", 2), new Tuple<string, int>("a", 3) }, 1);
|
||||
/// var m = l.Join(r, 2).Collect();
|
||||
///
|
||||
/// [('a', (1, 2)), ('a', (1, 3))]
|
||||
|
@ -174,9 +178,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="other"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, Tuple<V, W>>> Join<K, V, W>(
|
||||
this RDD<KeyValuePair<K, V>> self,
|
||||
RDD<KeyValuePair<K, W>> other,
|
||||
public static RDD<Tuple<K, Tuple<V, W>>> Join<K, V, W>(
|
||||
this RDD<Tuple<K, V>> self,
|
||||
RDD<Tuple<K, W>> other,
|
||||
int numPartitions = 0)
|
||||
{
|
||||
return self.GroupWith(other, numPartitions).FlatMapValues(
|
||||
|
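A usage sketch for the Tuple-based Join above, following the XML doc example (assumes an existing SparkContext `sc`; illustrative only):

```c#
var l = sc.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 1);
var r = sc.Parallelize(new[] { new Tuple<string, int>("a", 2), new Tuple<string, int>("a", 3) }, 1);

var joined = l.Join(r, 2).Collect(); // [('a', (1, 2)), ('a', (1, 3))] as Tuple<string, Tuple<int, int>>
```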
@ -194,9 +198,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// Hash-partitions the resulting RDD into the given number of partitions.
|
||||
///
|
||||
/// var l = sc.Parallelize(
|
||||
/// new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 1);
|
||||
/// new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 1);
|
||||
/// var r = sc.Parallelize(
|
||||
/// new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
/// new[] { new Tuple<string, int>("a", 2) }, 1);
|
||||
/// var m = l.LeftOuterJoin(r).Collect();
|
||||
///
|
||||
/// [('a', (1, 2)), ('b', (4, Option))]
|
||||
|
@ -209,9 +213,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="other"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, Tuple<V, Option<W>>>> LeftOuterJoin<K, V, W>(
|
||||
this RDD<KeyValuePair<K, V>> self,
|
||||
RDD<KeyValuePair<K, W>> other,
|
||||
public static RDD<Tuple<K, Tuple<V, Option<W>>>> LeftOuterJoin<K, V, W>(
|
||||
this RDD<Tuple<K, V>> self,
|
||||
RDD<Tuple<K, W>> other,
|
||||
int numPartitions = 0)
|
||||
{
|
||||
return self.GroupWith(other, numPartitions).FlatMapValues(
|
||||
|
@ -228,9 +232,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// Hash-partitions the resulting RDD into the given number of partitions.
|
||||
///
|
||||
/// var l = sc.Parallelize(
|
||||
/// new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
/// new[] { new Tuple<string, int>("a", 2) }, 1);
|
||||
/// var r = sc.Parallelize(
|
||||
/// new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 1);
|
||||
/// new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 1);
|
||||
/// var m = l.RightOuterJoin(r).Collect();
|
||||
///
|
||||
/// [('a', (2, 1)), ('b', (Option, 4))]
|
||||
|
@ -243,9 +247,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="other"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, Tuple<Option<V>, W>>> RightOuterJoin<K, V, W>(
|
||||
this RDD<KeyValuePair<K, V>> self,
|
||||
RDD<KeyValuePair<K, W>> other,
|
||||
public static RDD<Tuple<K, Tuple<Option<V>, W>>> RightOuterJoin<K, V, W>(
|
||||
this RDD<Tuple<K, V>> self,
|
||||
RDD<Tuple<K, W>> other,
|
||||
int numPartitions = 0)
|
||||
{
|
||||
return self.GroupWith(other, numPartitions).FlatMapValues(
|
||||
|
@ -267,9 +271,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// Hash-partitions the resulting RDD into the given number of partitions.
|
||||
///
|
||||
/// var l = sc.Parallelize(
|
||||
/// new[] { new KeyValuePair<string, int>("a", 1), KeyValuePair<string, int>("b", 4) }, 1);
|
||||
/// new[] { new Tuple<string, int>("a", 1), Tuple<string, int>("b", 4) }, 1);
|
||||
/// var r = sc.Parallelize(
|
||||
/// new[] { new KeyValuePair<string, int>("a", 2), new KeyValuePair<string, int>("c", 8) }, 1);
|
||||
/// new[] { new Tuple<string, int>("a", 2), new Tuple<string, int>("c", 8) }, 1);
|
||||
/// var m = l.FullOuterJoin(r).Collect();
|
||||
///
|
||||
/// [('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]
|
||||
|
@ -282,9 +286,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="other"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, Tuple<Option<V>, Option<W>>>> FullOuterJoin<K, V, W>(
|
||||
this RDD<KeyValuePair<K, V>> self,
|
||||
RDD<KeyValuePair<K, W>> other,
|
||||
public static RDD<Tuple<K, Tuple<Option<V>, Option<W>>>> FullOuterJoin<K, V, W>(
|
||||
this RDD<Tuple<K, V>> self,
|
||||
RDD<Tuple<K, W>> other,
|
||||
int numPartitions = 0)
|
||||
{
|
||||
return self.GroupWith(other, numPartitions).FlatMapValues(
|
||||
|
@ -295,13 +299,13 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <summary>
|
||||
/// Return a copy of the RDD partitioned using the specified partitioner.
|
||||
///
|
||||
/// sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1).Map(x => new KeyValuePair<int, int>(x, x)).PartitionBy(3).Glom().Collect()
|
||||
/// sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1).Map(x => new Tuple<int, int>(x, x)).PartitionBy(3).Glom().Collect()
|
||||
/// </summary>
|
||||
/// <param name="self"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <param name="partitionFunc"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, V>> PartitionBy<K, V>(this RDD<KeyValuePair<K, V>> self, int numPartitions = 0,
|
||||
public static RDD<Tuple<K, V>> PartitionBy<K, V>(this RDD<Tuple<K, V>> self, int numPartitions = 0,
|
||||
Func<dynamic, int> partitionFunc = null)
|
||||
{
|
||||
if (numPartitions == 0)
|
||||
|
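A sketch of PartitionBy on Tuple pairs, following the doc-comment example above (assumes `sc`; illustrative only):

```c#
// Hash-partition (x, x) pairs into 3 partitions, then inspect each partition with Glom.
var partitions = sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1)
                   .Map(x => new Tuple<int, int>(x, x))
                   .PartitionBy(3)
                   .Glom()
                   .Collect();
```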
@ -318,7 +322,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
// convert shuffling version of RDD[(Long, Array[Byte])] back to normal RDD[Array[Byte]]
|
||||
// invoking property keyed.RddProxy marks the end of current pipeline RDD after shuffling
|
||||
// and potentially starts next pipeline RDD with default SerializedMode.Byte
|
||||
var rdd = new RDD<KeyValuePair<K, V>>(self.sparkContext.SparkContextProxy.CreatePairwiseRDD(keyed.RddProxy, numPartitions,
|
||||
var rdd = new RDD<Tuple<K, V>>(self.sparkContext.SparkContextProxy.CreatePairwiseRDD(keyed.RddProxy, numPartitions,
|
||||
GenerateObjectId(partitionFunc)), self.sparkContext);
|
||||
rdd.partitioner = partitioner;
|
||||
|
||||
|
@ -346,9 +350,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// sc.Parallelize(
|
||||
/// new[]
|
||||
/// {
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// new Tuple<string, int>("a", 1),
|
||||
/// new Tuple<string, int>("b", 1),
|
||||
/// new Tuple<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
|
||||
///
|
||||
|
@ -363,8 +367,8 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="mergeCombiners"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, C>> CombineByKey<K, V, C>(
|
||||
this RDD<KeyValuePair<K, V>> self,
|
||||
public static RDD<Tuple<K, C>> CombineByKey<K, V, C>(
|
||||
this RDD<Tuple<K, V>> self,
|
||||
Func<C> createCombiner,
|
||||
Func<C, V, C> mergeValue,
|
||||
Func<C, C, C> mergeCombiners,
|
||||
|
@ -389,9 +393,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// sc.Parallelize(
|
||||
/// new[]
|
||||
/// {
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// new Tuple<string, int>("a", 1),
|
||||
/// new Tuple<string, int>("b", 1),
|
||||
/// new Tuple<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
|
||||
///
|
||||
|
@ -406,8 +410,8 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="combOp"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, U>> AggregateByKey<K, V, U>(
|
||||
this RDD<KeyValuePair<K, V>> self,
|
||||
public static RDD<Tuple<K, U>> AggregateByKey<K, V, U>(
|
||||
this RDD<Tuple<K, V>> self,
|
||||
Func<U> zeroValue,
|
||||
Func<U, V, U> seqOp,
|
||||
Func<U, U, U> combOp,
|
||||
|
@ -425,9 +429,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// sc.Parallelize(
|
||||
/// new[]
|
||||
/// {
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// new Tuple<string, int>("a", 1),
|
||||
/// new Tuple<string, int>("b", 1),
|
||||
/// new Tuple<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
|
||||
///
|
||||
|
@ -440,8 +444,8 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="func"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, V>> FoldByKey<K, V>(
|
||||
this RDD<KeyValuePair<K, V>> self,
|
||||
public static RDD<Tuple<K, V>> FoldByKey<K, V>(
|
||||
this RDD<Tuple<K, V>> self,
|
||||
Func<V> zeroValue,
|
||||
Func<V, V, V> func,
|
||||
int numPartitions = 0)
|
||||
|
@ -460,9 +464,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// sc.Parallelize(
|
||||
/// new[]
|
||||
/// {
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// new Tuple<string, int>("a", 1),
|
||||
/// new Tuple<string, int>("b", 1),
|
||||
/// new Tuple<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .GroupByKey().MapValues(l => string.Join(" ", l)).Collect()
|
||||
///
|
||||
|
@ -474,7 +478,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="self"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, List<V>>> GroupByKey<K, V>(this RDD<KeyValuePair<K, V>> self, int numPartitions = 0)
|
||||
public static RDD<Tuple<K, List<V>>> GroupByKey<K, V>(this RDD<Tuple<K, V>> self, int numPartitions = 0)
|
||||
{
|
||||
return CombineByKey(self,
|
||||
() => new List<V>(),
|
||||
|
@ -490,8 +494,8 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// sc.Parallelize(
|
||||
/// new[]
|
||||
/// {
|
||||
/// new KeyValuePair<string, string[]>("a", new[]{"apple", "banana", "lemon"}),
|
||||
/// new KeyValuePair<string, string[]>("b", new[]{"grapes"})
|
||||
/// new Tuple<string, string[]>("a", new[]{"apple", "banana", "lemon"}),
|
||||
/// new Tuple<string, string[]>("b", new[]{"grapes"})
|
||||
/// }, 2)
|
||||
/// .MapValues(x => x.Length).Collect()
|
||||
///
|
||||
|
@ -504,7 +508,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="self"></param>
|
||||
/// <param name="func"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, U>> MapValues<K, V, U>(this RDD<KeyValuePair<K, V>> self, Func<V, U> func)
|
||||
public static RDD<Tuple<K, U>> MapValues<K, V, U>(this RDD<Tuple<K, V>> self, Func<V, U> func)
|
||||
{
|
||||
return self.Map(new MapValuesHelper<K, V, U>(func).Execute, true);
|
||||
}
|
||||
|
@ -516,8 +520,8 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// x = sc.Parallelize(
|
||||
/// new[]
|
||||
/// {
|
||||
/// new KeyValuePair<string, string[]>("a", new[]{"x", "y", "z"}),
|
||||
/// new KeyValuePair<string, string[]>("b", new[]{"p", "r"})
|
||||
/// new Tuple<string, string[]>("a", new[]{"x", "y", "z"}),
|
||||
/// new Tuple<string, string[]>("b", new[]{"p", "r"})
|
||||
/// }, 2)
|
||||
/// .FlatMapValues(x => x).Collect()
|
||||
///
|
||||
|
@ -530,13 +534,13 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="self"></param>
|
||||
/// <param name="func"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, U>> FlatMapValues<K, V, U>(this RDD<KeyValuePair<K, V>> self, Func<V, IEnumerable<U>> func)
|
||||
public static RDD<Tuple<K, U>> FlatMapValues<K, V, U>(this RDD<Tuple<K, V>> self, Func<V, IEnumerable<U>> func)
|
||||
{
|
||||
return self.FlatMap(new FlatMapValuesHelper<K, V, U>(func).Execute, true);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// explicitly convert KeyValuePair<K, V> to KeyValuePair<K, dynamic>
|
||||
/// explicitly convert Tuple<K, V> to Tuple<K, dynamic>
|
||||
/// since they are incompatible types, unlike V to dynamic
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
|
@ -546,10 +550,10 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <typeparam name="W3"></typeparam>
|
||||
/// <param name="self"></param>
|
||||
/// <returns></returns>
|
||||
private static RDD<KeyValuePair<K, dynamic>> MapPartitionsWithIndex<K, V, W1, W2, W3>(this RDD<KeyValuePair<K, dynamic>> self)
|
||||
private static RDD<Tuple<K, dynamic>> MapPartitionsWithIndex<K, V, W1, W2, W3>(this RDD<Tuple<K, dynamic>> self)
|
||||
{
|
||||
CSharpWorkerFunc csharpWorkerFunc = new CSharpWorkerFunc(new DynamicTypingWrapper<K, V, W1, W2, W3>().Execute);
|
||||
var pipelinedRDD = new PipelinedRDD<KeyValuePair<K, dynamic>>
|
||||
var pipelinedRDD = new PipelinedRDD<Tuple<K, dynamic>>
|
||||
{
|
||||
workerFunc = csharpWorkerFunc,
|
||||
preservesPartitioning = true,
|
||||
|
@ -568,8 +572,8 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// For each key k in this RDD or <paramref name="other"/>, return a resulting RDD that
|
||||
/// contains a tuple with the list of values for that key in this RDD as well as <paramref name="other"/>.
|
||||
///
|
||||
/// var x = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
/// var x = sc.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new Tuple<string, int>("a", 2) }, 1);
|
||||
/// x.GroupWith(y).Collect();
|
||||
///
|
||||
/// [('a', ([1], [2])), ('b', ([4], []))]
|
||||
|
@ -582,16 +586,16 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="other"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, Tuple<List<V>, List<W>>>> GroupWith<K, V, W>(
|
||||
this RDD<KeyValuePair<K, V>> self,
|
||||
RDD<KeyValuePair<K, W>> other,
|
||||
public static RDD<Tuple<K, Tuple<List<V>, List<W>>>> GroupWith<K, V, W>(
|
||||
this RDD<Tuple<K, V>> self,
|
||||
RDD<Tuple<K, W>> other,
|
||||
int numPartitions = 0)
|
||||
{
|
||||
// MapValues, which introduces an extra CSharpRDD, is not necessary when unioning different RDD types
|
||||
if (typeof(V) != typeof(W))
|
||||
{
|
||||
return self.ConvertTo<KeyValuePair<K, dynamic>>()
|
||||
.Union(other.ConvertTo<KeyValuePair<K, dynamic>>())
|
||||
return self.ConvertTo<Tuple<K, dynamic>>()
|
||||
.Union(other.ConvertTo<Tuple<K, dynamic>>())
|
||||
.MapPartitionsWithIndex<K, V, W, W, W>()
|
||||
.CombineByKey(
|
||||
() => new Tuple<List<V>, List<W>>(new List<V>(), new List<W>()),
|
||||
|
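A sketch of the two-RDD GroupWith above, based on its doc-comment example (assumes `sc`; illustrative only):

```c#
var x = sc.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 2);
var y = sc.Parallelize(new[] { new Tuple<string, int>("a", 2) }, 1);

// Each key maps to a Tuple of value lists: [('a', ([1], [2])), ('b', ([4], []))]
var grouped = x.GroupWith(y).Collect();
```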
@ -610,9 +614,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// var x = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 5), new KeyValuePair<string, int>("b", 6) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 2);
|
||||
/// var z = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
/// var x = sc.Parallelize(new[] { new Tuple<string, int>("a", 5), new Tuple<string, int>("b", 6) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 2);
|
||||
/// var z = sc.Parallelize(new[] { new Tuple<string, int>("a", 2) }, 1);
|
||||
/// x.GroupWith(y, z).Collect();
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
|
@ -624,18 +628,18 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="other2"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, Tuple<List<V>, List<W1>, List<W2>>>> GroupWith<K, V, W1, W2>(
|
||||
this RDD<KeyValuePair<K, V>> self,
|
||||
RDD<KeyValuePair<K, W1>> other1,
|
||||
RDD<KeyValuePair<K, W2>> other2,
|
||||
public static RDD<Tuple<K, Tuple<List<V>, List<W1>, List<W2>>>> GroupWith<K, V, W1, W2>(
|
||||
this RDD<Tuple<K, V>> self,
|
||||
RDD<Tuple<K, W1>> other1,
|
||||
RDD<Tuple<K, W2>> other2,
|
||||
int numPartitions = 0)
|
||||
{
|
||||
// MapValues, which introduces an extra CSharpRDD, is not necessary when unioning different RDD types
|
||||
if (!(typeof(V) == typeof(W1) && typeof(V) == typeof(W2)))
|
||||
{
|
||||
return self.ConvertTo<KeyValuePair<K, dynamic>>()
|
||||
.Union(other1.ConvertTo<KeyValuePair<K, dynamic>>())
|
||||
.Union(other2.ConvertTo<KeyValuePair<K, dynamic>>())
|
||||
return self.ConvertTo<Tuple<K, dynamic>>()
|
||||
.Union(other1.ConvertTo<Tuple<K, dynamic>>())
|
||||
.Union(other2.ConvertTo<Tuple<K, dynamic>>())
|
||||
.MapPartitionsWithIndex<K, V, W1, W2, W2>()
|
||||
.CombineByKey(
|
||||
() => new Tuple<List<V>, List<W1>, List<W2>>(new List<V>(), new List<W1>(), new List<W2>()),
|
||||
|
@ -655,10 +659,10 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// var x = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 5), new KeyValuePair<string, int>("b", 6) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 2);
|
||||
/// var z = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
/// var w = sc.Parallelize(new[] { new KeyValuePair<string, int>("b", 42) }, 1);
|
||||
/// var x = sc.Parallelize(new[] { new Tuple<string, int>("a", 5), new Tuple<string, int>("b", 6) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 2);
|
||||
/// var z = sc.Parallelize(new[] { new Tuple<string, int>("a", 2) }, 1);
|
||||
/// var w = sc.Parallelize(new[] { new Tuple<string, int>("b", 42) }, 1);
|
||||
/// var m = x.GroupWith(y, z, w).MapValues(l => string.Join(" ", l.Item1) + " : " + string.Join(" ", l.Item2) + " : " + string.Join(" ", l.Item3) + " : " + string.Join(" ", l.Item4)).Collect();
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
|
@ -672,20 +676,20 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="other3"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, Tuple<List<V>, List<W1>, List<W2>, List<W3>>>> GroupWith<K, V, W1, W2, W3>(
|
||||
this RDD<KeyValuePair<K, V>> self,
|
||||
RDD<KeyValuePair<K, W1>> other1,
|
||||
RDD<KeyValuePair<K, W2>> other2,
|
||||
RDD<KeyValuePair<K, W3>> other3,
|
||||
public static RDD<Tuple<K, Tuple<List<V>, List<W1>, List<W2>, List<W3>>>> GroupWith<K, V, W1, W2, W3>(
|
||||
this RDD<Tuple<K, V>> self,
|
||||
RDD<Tuple<K, W1>> other1,
|
||||
RDD<Tuple<K, W2>> other2,
|
||||
RDD<Tuple<K, W3>> other3,
|
||||
int numPartitions = 0)
|
||||
{
|
||||
// MapValues, which introduces an extra CSharpRDD, is not necessary when unioning different RDD types
|
||||
if (!(typeof(V) == typeof(W1) && typeof(V) == typeof(W2)))
|
||||
{
|
||||
return self.ConvertTo<KeyValuePair<K, dynamic>>()
|
||||
.Union(other1.ConvertTo<KeyValuePair<K, dynamic>>())
|
||||
.Union(other2.ConvertTo<KeyValuePair<K, dynamic>>())
|
||||
.Union(other3.ConvertTo<KeyValuePair<K, dynamic>>())
|
||||
return self.ConvertTo<Tuple<K, dynamic>>()
|
||||
.Union(other1.ConvertTo<Tuple<K, dynamic>>())
|
||||
.Union(other2.ConvertTo<Tuple<K, dynamic>>())
|
||||
.Union(other3.ConvertTo<Tuple<K, dynamic>>())
|
||||
.MapPartitionsWithIndex<K, V, W1, W2, W3>()
|
||||
.CombineByKey(
|
||||
() => new Tuple<List<V>, List<W1>, List<W2>, List<W3>>(new List<V>(), new List<W1>(), new List<W2>(), new List<W3>()),
|
||||
|
@ -713,7 +717,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
// ///
|
||||
// /// var fractions = new <see cref="Dictionary{string, double}"/> { { "a", 0.2 }, { "b", 0.1 } };
|
||||
// /// var rdd = sc.Parallelize(fractions.Keys.ToArray(), 2).Cartesian(sc.Parallelize(Enumerable.Range(0, 1000), 2));
|
||||
// /// var sample = rdd.Map(t => new KeyValuePair<string, int>(t.Item1, t.Item2)).SampleByKey(false, fractions, 2).GroupByKey().Collect();
|
||||
// /// var sample = rdd.Map(t => new Tuple<string, int>(t.Item1, t.Item2)).SampleByKey(false, fractions, 2).GroupByKey().Collect();
|
||||
// ///
|
||||
// /// 100 < sample["a"].Length < 300 and 50 < sample["b"].Length < 150
|
||||
// /// true
|
||||
|
@ -730,8 +734,8 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
// /// <param name="fractions"></param>
|
||||
// /// <param name="seed"></param>
|
||||
// /// <returns></returns>
|
||||
//public static RDD<KeyValuePair<string, V>> SampleByKey<V>(
|
||||
// this RDD<KeyValuePair<string, V>> self,
|
||||
//public static RDD<Tuple<string, V>> SampleByKey<V>(
|
||||
// this RDD<Tuple<string, V>> self,
|
||||
// bool withReplacement,
|
||||
// Dictionary<string, double> fractions,
|
||||
// long seed)
|
||||
|
@ -739,14 +743,14 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
// if (fractions.Any(f => f.Value < 0.0))
|
||||
// throw new ArgumentException(string.Format("Negative fraction value found in: {0}", string.Join(",", fractions.Values.ToArray())));
|
||||
|
||||
// return new RDD<KeyValuePair<string, V>>(self.RddProxy.SampleByKey(withReplacement, fractions, seed), self.sparkContext);
|
||||
// return new RDD<Tuple<string, V>>(self.RddProxy.SampleByKey(withReplacement, fractions, seed), self.sparkContext);
|
||||
//}
|
||||
|
||||
/// <summary>
|
||||
/// Return each (key, value) pair in this RDD that has no pair with matching key in <paramref name="other"/>.
|
||||
///
|
||||
/// var x = sc.Parallelize(new[] { new KeyValuePair<string, int?>("a", 1), new KeyValuePair<string, int?>("b", 4), new KeyValuePair<string, int?>("b", 5), new KeyValuePair<string, int?>("a", 2) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new KeyValuePair<string, int?>("a", 3), new KeyValuePair<string, int?>("c", null) }, 2);
|
||||
/// var x = sc.Parallelize(new[] { new Tuple<string, int?>("a", 1), new Tuple<string, int?>("b", 4), new Tuple<string, int?>("b", 5), new Tuple<string, int?>("a", 2) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new Tuple<string, int?>("a", 3), new Tuple<string, int?>("c", null) }, 2);
|
||||
/// x.SubtractByKey(y).Collect();
|
||||
///
|
||||
/// [('b', 4), ('b', 5)]
|
||||
|
@ -759,7 +763,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="other"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, V>> SubtractByKey<K, V, W>(this RDD<KeyValuePair<K, V>> self, RDD<KeyValuePair<K, W>> other, int numPartitions = 0)
|
||||
public static RDD<Tuple<K, V>> SubtractByKey<K, V, W>(this RDD<Tuple<K, V>> self, RDD<Tuple<K, W>> other, int numPartitions = 0)
|
||||
{
|
||||
return self.GroupWith(other, numPartitions).FlatMapValues(t => t.Item1.Where(v => t.Item2.Count == 0));
|
||||
}
|
||||
|
@ -770,7 +774,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// searching the partition that the key maps to.
|
||||
///
|
||||
/// >>> l = range(1000)
|
||||
/// >>> rdd = sc.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new KeyValuePair<int, int>(x, y)), 10)
|
||||
/// >>> rdd = sc.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new Tuple<int, int>(x, y)), 10)
|
||||
/// >>> rdd.lookup(42)
|
||||
/// [42]
|
||||
///
|
||||
|
@ -780,7 +784,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="self"></param>
|
||||
/// <param name="key"></param>
|
||||
/// <returns></returns>
|
||||
public static V[] Lookup<K, V>(this RDD<KeyValuePair<K, V>> self, K key)
|
||||
public static V[] Lookup<K, V>(this RDD<Tuple<K, V>> self, K key)
|
||||
{
|
||||
return self.Filter(new LookupHelper<K, V>(key).Execute).Values().Collect();
|
||||
}
|
||||
|
@ -795,7 +799,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <typeparam name="V"></typeparam>
|
||||
/// <param name="self"></param>
|
||||
/// <param name="conf">Hadoop job configuration, passed in as a dict</param>
|
||||
public static void SaveAsNewAPIHadoopDataset<K, V>(this RDD<KeyValuePair<K, V>> self, IEnumerable<KeyValuePair<string, string>> conf)
|
||||
public static void SaveAsNewAPIHadoopDataset<K, V>(this RDD<Tuple<K, V>> self, IEnumerable<Tuple<string, string>> conf)
|
||||
{
|
||||
self.RddProxy.SaveAsNewAPIHadoopDataset(conf);
|
||||
}
|
||||
|
@ -811,7 +815,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="keyClass">fully qualified classname of key Writable class (e.g. "org.apache.hadoop.io.IntWritable", None by default)</param>
|
||||
/// <param name="valueClass">fully qualified classname of value Writable class (e.g. "org.apache.hadoop.io.Text", None by default)</param>
|
||||
/// <param name="conf">Hadoop job configuration, passed in as a dict (None by default)</param>
|
||||
public static void SaveAsNewAPIHadoopFile<K, V>(this RDD<KeyValuePair<K, V>> self, string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf)
|
||||
public static void SaveAsNewAPIHadoopFile<K, V>(this RDD<Tuple<K, V>> self, string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf)
|
||||
{
|
||||
self.RddProxy.SaveAsNewAPIHadoopFile(path, outputFormatClass, keyClass, valueClass, conf);
|
||||
}
|
||||
|
@ -826,7 +830,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <typeparam name="V"></typeparam>
|
||||
/// <param name="self"></param>
|
||||
/// <param name="conf">Hadoop job configuration, passed in as a dict</param>
|
||||
public static void SaveAsHadoopDataset<K, V>(this RDD<KeyValuePair<K, V>> self, IEnumerable<KeyValuePair<string, string>> conf)
|
||||
public static void SaveAsHadoopDataset<K, V>(this RDD<Tuple<K, V>> self, IEnumerable<Tuple<string, string>> conf)
|
||||
{
|
||||
self.RddProxy.SaveAsHadoopDataset(conf);
|
||||
}
|
||||
|
@ -848,7 +852,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="valueClass">fully qualified classname of value Writable class (e.g. "org.apache.hadoop.io.Text", None by default)</param>
|
||||
/// <param name="conf">(None by default)</param>
|
||||
/// <param name="compressionCodecClass">(None by default)</param>
|
||||
public static void SaveAsHadoopFile<K, V>(this RDD<KeyValuePair<K, V>> self, string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf, string compressionCodecClass)
|
||||
public static void SaveAsHadoopFile<K, V>(this RDD<Tuple<K, V>> self, string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf, string compressionCodecClass)
|
||||
{
|
||||
self.RddProxy.SaveAsHadoopFile(path, outputFormatClass, keyClass, valueClass, conf, compressionCodecClass);
|
||||
}
|
||||
|
@ -867,7 +871,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="self"></param>
|
||||
/// <param name="path">path to sequence file</param>
|
||||
/// <param name="compressionCodecClass">(None by default)</param>
|
||||
public static void SaveAsSequenceFile<K, V>(this RDD<KeyValuePair<K, V>> self, string path, string compressionCodecClass)
|
||||
public static void SaveAsSequenceFile<K, V>(this RDD<Tuple<K, V>> self, string path, string compressionCodecClass)
|
||||
{
|
||||
self.RddProxy.SaveAsSequenceFile(path, compressionCodecClass);
|
||||
}
|
||||
|
@ -887,12 +891,12 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
mergeCombiners = mc;
|
||||
}
|
||||
|
||||
public IEnumerable<KeyValuePair<K, C>> Execute(int pid, IEnumerable<KeyValuePair<K, C>> input)
|
||||
public IEnumerable<Tuple<K, C>> Execute(int pid, IEnumerable<Tuple<K, C>> input)
|
||||
{
|
||||
return input.GroupBy(
|
||||
kvp => kvp.Key,
|
||||
kvp => kvp.Value,
|
||||
(k, v) => new KeyValuePair<K, C>(k, v.Aggregate(mergeCombiners))
|
||||
kvp => kvp.Item1,
|
||||
kvp => kvp.Item2,
|
||||
(k, v) => new Tuple<K, C>(k, v.Aggregate(mergeCombiners))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -908,12 +912,12 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
this.mergeValue = mergeValue;
|
||||
}
|
||||
|
||||
public IEnumerable<KeyValuePair<K, C>> Execute(int pid, IEnumerable<KeyValuePair<K, V>> input)
|
||||
public IEnumerable<Tuple<K, C>> Execute(int pid, IEnumerable<Tuple<K, V>> input)
|
||||
{
|
||||
return input.GroupBy(
|
||||
kvp => kvp.Key,
|
||||
kvp => kvp.Value,
|
||||
(k, v) => new KeyValuePair<K, C>(k, v.Aggregate(createCombiner(), mergeValue))
|
||||
kvp => kvp.Item1,
|
||||
kvp => kvp.Item2,
|
||||
(k, v) => new Tuple<K, C>(k, v.Aggregate(createCombiner(), mergeValue))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -932,7 +936,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
this.partitionFunc = partitionFunc;
|
||||
}
|
||||
|
||||
public IEnumerable<byte[]> Execute(int split, IEnumerable<KeyValuePair<K, V>> input)
|
||||
public IEnumerable<byte[]> Execute(int split, IEnumerable<Tuple<K, V>> input)
|
||||
{
|
||||
// make sure that md5 is not null even if it is deserialized in C# worker
|
||||
if (md5 == null)
|
||||
|
@ -945,12 +949,12 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
var ms = new MemoryStream();
|
||||
if (partitionFunc == null)
|
||||
{
|
||||
formatter.Serialize(ms, kv.Key);
|
||||
formatter.Serialize(ms, kv.Item1);
|
||||
yield return md5.ComputeHash(ms.ToArray()).Take(8).ToArray();
|
||||
}
|
||||
else
|
||||
{
|
||||
long pid = (long)(partitionFunc(kv.Key) % numPartitions);
|
||||
long pid = (long)(partitionFunc(kv.Item1) % numPartitions);
|
||||
yield return SerDe.ToBytes(pid);
|
||||
}
|
||||
ms = new MemoryStream();
|
||||
|
@ -969,12 +973,12 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
func = f;
|
||||
}
|
||||
|
||||
public KeyValuePair<K, U> Execute(KeyValuePair<K, V> kvp)
|
||||
public Tuple<K, U> Execute(Tuple<K, V> kvp)
|
||||
{
|
||||
return new KeyValuePair<K, U>
|
||||
return new Tuple<K, U>
|
||||
(
|
||||
kvp.Key,
|
||||
func(kvp.Value)
|
||||
kvp.Item1,
|
||||
func(kvp.Item2)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -988,9 +992,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
func = f;
|
||||
}
|
||||
|
||||
public IEnumerable<KeyValuePair<K, U>> Execute(KeyValuePair<K, V> kvp)
|
||||
public IEnumerable<Tuple<K, U>> Execute(Tuple<K, V> kvp)
|
||||
{
|
||||
return func(kvp.Value).Select(v => new KeyValuePair<K, U>(kvp.Key, v));
|
||||
return func(kvp.Item2).Select(v => new Tuple<K, U>(kvp.Item1, v));
|
||||
}
|
||||
}
|
||||
[Serializable]
|
||||
|
@ -1001,9 +1005,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
{
|
||||
this.key = key;
|
||||
}
|
||||
internal bool Execute(KeyValuePair<K, V> input)
|
||||
internal bool Execute(Tuple<K, V> input)
|
||||
{
|
||||
return input.Key.ToString() == key.ToString();
|
||||
return input.Item1.ToString() == key.ToString();
|
||||
}
|
||||
}
|
||||
|
||||
@ -51,6 +51,17 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return the SparkContext that created this RDD
|
||||
/// </summary>
|
||||
public SparkContext SparkContext
|
||||
{
|
||||
get
|
||||
{
|
||||
return sparkContext;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return whether this RDD has been cached or not
|
||||
/// </summary>
|
||||
|
@ -189,7 +200,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <summary>
|
||||
/// Return a new RDD by applying a function to each element of this RDD.
|
||||
///
|
||||
/// sc.Parallelize(new string[]{"b", "a", "c"}, 1).Map(x => new KeyValuePair<string, int>(x, 1)).Collect()
|
||||
/// sc.Parallelize(new string[]{"b", "a", "c"}, 1).Map(x => new Tuple<string, int>(x, 1)).Collect()
|
||||
/// [('a', 1), ('b', 1), ('c', 1)]
|
||||
///
|
||||
/// </summary>
|
||||
|
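A sketch of Map producing Tuple pairs, matching the doc-comment example above (assumes `sc`); the result can feed the pair operations shown earlier (ReduceByKey, SortByKey, and so on):

```c#
var pairs = sc.Parallelize(new string[] { "b", "a", "c" }, 1)
              .Map(x => new Tuple<string, int>(x, 1));

var collected = pairs.Collect(); // [('a', 1), ('b', 1), ('c', 1)] per the doc example
```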
@ -288,7 +299,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <returns></returns>
|
||||
public RDD<T> Distinct(int numPartitions = 0)
|
||||
{
|
||||
return Map(x => new KeyValuePair<T, int>(x, 0)).ReduceByKey((x, y) => x, numPartitions).Map<T>(x => x.Key);
|
||||
return Map(x => new Tuple<T, int>(x, 0)).ReduceByKey((x, y) => x, numPartitions).Map<T>(x => x.Item1);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -461,9 +472,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <returns></returns>
|
||||
public RDD<T> Intersection(RDD<T> other)
|
||||
{
|
||||
return Map(v => new KeyValuePair<T, int>(v, 0))
|
||||
.GroupWith(other.Map(v => new KeyValuePair<T, int>(v, 0)))
|
||||
.Filter(kv => kv.Value.Item1.Count > 0 && kv.Value.Item2.Count > 0)
|
||||
return Map(v => new Tuple<T, int>(v, 0))
|
||||
.GroupWith(other.Map(v => new Tuple<T, int>(v, 0)))
|
||||
.Filter(kv => kv.Item2.Item1.Count > 0 && kv.Item2.Item2.Count > 0)
|
||||
.Keys();
|
||||
}
|
||||
|
||||
|
@ -533,7 +544,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
///
|
||||
/// </summary>
|
||||
/// <returns></returns>
|
||||
public RDD<KeyValuePair<K, List<T>>> GroupBy<K>(Func<T, K> f, int numPartitions = 0)
|
||||
public RDD<Tuple<K, List<T>>> GroupBy<K>(Func<T, K> f, int numPartitions = 0)
|
||||
{
|
||||
return KeyBy(f).GroupByKey(numPartitions);
|
||||
}
|
||||
|
@ -639,14 +650,14 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
if (depth < 1)
|
||||
throw new ArgumentException(string.Format("Depth cannot be smaller than 1 but got {0}.", depth));
|
||||
|
||||
var zeroValue = new KeyValuePair<T, bool>(default(T), true); // Use the second entry to indicate whether this is a dummy value.
|
||||
var zeroValue = new Tuple<T, bool>(default(T), true); // Use the second entry to indicate whether this is a dummy value.
|
||||
|
||||
Func<KeyValuePair<T, bool>, KeyValuePair<T, bool>, KeyValuePair<T, bool>> op = new TreeReduceHelper<T>(f).Execute;
|
||||
Func<Tuple<T, bool>, Tuple<T, bool>, Tuple<T, bool>> op = new TreeReduceHelper<T>(f).Execute;
|
||||
|
||||
var reduced = Map<KeyValuePair<T, bool>>(x => new KeyValuePair<T, bool>(x, false)).TreeAggregate(zeroValue, op, op, depth);
|
||||
if (reduced.Value)
|
||||
var reduced = Map<Tuple<T, bool>>(x => new Tuple<T, bool>(x, false)).TreeAggregate(zeroValue, op, op, depth);
|
||||
if (reduced.Item2)
|
||||
throw new ArgumentException("Cannot reduce empty RDD.");
|
||||
return reduced.Key;
|
||||
return reduced.Item1;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -736,7 +747,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
numPartitions /= scale;
|
||||
|
||||
partiallyAggregated = partiallyAggregated
|
||||
.MapPartitionsWithIndex<KeyValuePair<int, U>>(new TreeAggregateHelper<U>(numPartitions).Execute)
|
||||
.MapPartitionsWithIndex<Tuple<int, U>>(new TreeAggregateHelper<U>(numPartitions).Execute)
|
||||
.ReduceByKey(combOp, numPartitions)
|
||||
.Values();
|
||||
}
|
||||
|
@ -762,9 +773,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
///
|
||||
/// </summary>
|
||||
/// <returns></returns>
|
||||
public Dictionary<T, long> CountByValue()
|
||||
public IEnumerable<Tuple<T, long>> CountByValue()
|
||||
{
|
||||
return Map<KeyValuePair<T, T>>(v => new KeyValuePair<T, T>(v, default(T))).CountByKey();
|
||||
return Map(v => new Tuple<T, T>(v, default(T))).CountByKey();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -872,9 +883,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <returns></returns>
|
||||
public RDD<T> Subtract(RDD<T> other, int numPartitions = 0)
|
||||
{
|
||||
return Map<KeyValuePair<T, T>>(v => new KeyValuePair<T, T>(v, default(T))).SubtractByKey
|
||||
return Map<Tuple<T, T>>(v => new Tuple<T, T>(v, default(T))).SubtractByKey
|
||||
(
|
||||
other.Map<KeyValuePair<T, T>>(v => new KeyValuePair<T, T>(v, default(T))),
|
||||
other.Map<Tuple<T, T>>(v => new Tuple<T, T>(v, default(T))),
|
||||
numPartitions
|
||||
)
|
||||
.Keys();
|
||||
|
@ -890,9 +901,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <typeparam name="K"></typeparam>
|
||||
/// <param name="f"></param>
|
||||
/// <returns></returns>
|
||||
public RDD<KeyValuePair<K, T>> KeyBy<K>(Func<T, K> f)
|
||||
public RDD<Tuple<K, T>> KeyBy<K>(Func<T, K> f)
|
||||
{
|
||||
return Map<KeyValuePair<K, T>>(new KeyByHelper<K, T>(f).Execute);
|
||||
return Map<Tuple<K, T>>(new KeyByHelper<K, T>(f).Execute);
|
||||
}
|
||||
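`KeyBy` now produces `RDD<Tuple<K, T>>`, and downstream pair operations such as `GroupByKey` consume the same tuple shape after this migration. Sketch (illustrative names):

```c#
// Hypothetical usage sketch, not part of this diff.
var names = sparkContext.Parallelize(new[] { "spark", "mobius", "scala" }, 2);
RDD<Tuple<int, string>> byLength = names.KeyBy(s => s.Length);
// Item1 is the computed key, Item2 is the original element.
var grouped = byLength.GroupByKey().Collect();
```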
|
||||
/// <summary>
|
||||
|
@ -950,9 +961,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <typeparam name="U"></typeparam>
|
||||
/// <param name="other"></param>
|
||||
/// <returns></returns>
|
||||
public RDD<KeyValuePair<T, U>> Zip<U>(RDD<U> other)
|
||||
public RDD<Tuple<T, U>> Zip<U>(RDD<U> other)
|
||||
{
|
||||
return new RDD<KeyValuePair<T, U>>(RddProxy.Zip(other.RddProxy), sparkContext, SerializedMode.Pair);
|
||||
return new RDD<Tuple<T, U>>(RddProxy.Zip(other.RddProxy), sparkContext, SerializedMode.Pair);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -971,7 +982,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
///
|
||||
/// </summary>
|
||||
/// <returns></returns>
|
||||
public RDD<KeyValuePair<T, long>> ZipWithIndex()
|
||||
public RDD<Tuple<T, long>> ZipWithIndex()
|
||||
{
|
||||
int num = GetNumPartitions();
|
||||
int[] starts = new int[num];
|
||||
|
@ -981,7 +992,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
for (int i = 0; i < nums.Length - 1; i++)
|
||||
starts[i + 1] = starts[i] + nums[i];
|
||||
}
|
||||
return MapPartitionsWithIndex<KeyValuePair<T, long>>(new ZipWithIndexHelper<T>(starts).Execute);
|
||||
return MapPartitionsWithIndex<Tuple<T, long>>(new ZipWithIndexHelper<T>(starts).Execute);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -996,10 +1007,10 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
///
|
||||
/// </summary>
|
||||
/// <returns></returns>
|
||||
public RDD<KeyValuePair<T, long>> ZipWithUniqueId()
|
||||
public RDD<Tuple<T, long>> ZipWithUniqueId()
|
||||
{
|
||||
int num = GetNumPartitions();
|
||||
return MapPartitionsWithIndex<KeyValuePair<T, long>>(new ZipWithUniqueIdHelper<T>(num).Execute);
|
||||
return MapPartitionsWithIndex<Tuple<T, long>>(new ZipWithUniqueIdHelper<T>(num).Execute);
|
||||
}
|
||||
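Both zip variants now pair each element with a `long` inside a `Tuple<T, long>`. `ZipWithIndex` computes per-partition counts first and assigns contiguous indexes, while `ZipWithUniqueId` assigns `k * numPartitions + partitionId` without that pass, so its ids are unique but not necessarily contiguous. Sketch (illustrative data):

```c#
// Hypothetical usage sketch, not part of this diff.
var letters = sparkContext.Parallelize(new[] { "a", "b", "c", "d" }, 2);

RDD<Tuple<string, long>> indexed = letters.ZipWithIndex();     // ("a",0) ("b",1) ("c",2) ("d",3)
RDD<Tuple<string, long>> uniqueIds = letters.ZipWithUniqueId(); // unique ids, not necessarily contiguous
```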
|
||||
/// <summary>
|
||||
|
@ -1225,27 +1236,27 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
{
|
||||
K key;
|
||||
dynamic value;
|
||||
if (x is KeyValuePair<K, V>)
|
||||
if (x is Tuple<K, V>)
|
||||
{
|
||||
key = ((KeyValuePair<K, V>)x).Key;
|
||||
value = ((KeyValuePair<K, V>)x).Value;
|
||||
key = ((Tuple<K, V>)x).Item1;
|
||||
value = ((Tuple<K, V>)x).Item2;
|
||||
}
|
||||
else if (x is KeyValuePair<K, W1>)
|
||||
else if (x is Tuple<K, W1>)
|
||||
{
|
||||
key = ((KeyValuePair<K, W1>)x).Key;
|
||||
value = ((KeyValuePair<K, W1>)x).Value;
|
||||
key = ((Tuple<K, W1>)x).Item1;
|
||||
value = ((Tuple<K, W1>)x).Item2;
|
||||
}
|
||||
else if (x is KeyValuePair<K, W2>)
|
||||
else if (x is Tuple<K, W2>)
|
||||
{
|
||||
key = ((KeyValuePair<K, W2>)x).Key;
|
||||
value = ((KeyValuePair<K, W2>)x).Value;
|
||||
key = ((Tuple<K, W2>)x).Item1;
|
||||
value = ((Tuple<K, W2>)x).Item2;
|
||||
}
|
||||
else
|
||||
{
|
||||
key = ((KeyValuePair<K, W3>)x).Key;
|
||||
value = ((KeyValuePair<K, W3>)x).Value;
|
||||
key = ((Tuple<K, W3>)x).Item1;
|
||||
value = ((Tuple<K, W3>)x).Item2;
|
||||
}
|
||||
return new KeyValuePair<K, dynamic>(key, value);
|
||||
return new Tuple<K, dynamic>(key, value);
|
||||
})
|
||||
.Cast<dynamic>();
|
||||
}
|
||||
|
@ -1405,9 +1416,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
func = f;
|
||||
}
|
||||
|
||||
internal KeyValuePair<K, T> Execute(T input)
|
||||
internal Tuple<K, T> Execute(T input)
|
||||
{
|
||||
return new KeyValuePair<K, T>(func(input), input);
|
||||
return new Tuple<K, T>(func(input), input);
|
||||
}
|
||||
}
|
||||
[Serializable]
|
||||
|
@ -1434,9 +1445,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
{
|
||||
this.numPartitions = numPartitions;
|
||||
}
|
||||
internal IEnumerable<KeyValuePair<int, U>> Execute(int pid, IEnumerable<U> input)
|
||||
internal IEnumerable<Tuple<int, U>> Execute(int pid, IEnumerable<U> input)
|
||||
{
|
||||
return input.Select(x => new KeyValuePair<int, U>(pid % numPartitions, x));
|
||||
return input.Select(x => new Tuple<int, U>(pid % numPartitions, x));
|
||||
}
|
||||
}
|
||||
[Serializable]
|
||||
|
@ -1447,14 +1458,14 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
{
|
||||
this.func = func;
|
||||
}
|
||||
internal KeyValuePair<T, bool> Execute(KeyValuePair<T, bool> x, KeyValuePair<T, bool> y)
|
||||
internal Tuple<T, bool> Execute(Tuple<T, bool> x, Tuple<T, bool> y)
|
||||
{
|
||||
if (x.Value)
|
||||
if (x.Item2)
|
||||
return y;
|
||||
else if (y.Value)
|
||||
else if (y.Item2)
|
||||
return x;
|
||||
else
|
||||
return new KeyValuePair<T, bool>(func(x.Key, y.Key), false);
|
||||
return new Tuple<T, bool>(func(x.Item1, y.Item1), false);
|
||||
}
|
||||
}
|
||||
[Serializable]
|
||||
|
@ -1539,12 +1550,12 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
{
|
||||
this.num = num;
|
||||
}
|
||||
internal IEnumerable<KeyValuePair<T, long>> Execute(int pid, IEnumerable<T> input)
|
||||
internal IEnumerable<Tuple<T, long>> Execute(int pid, IEnumerable<T> input)
|
||||
{
|
||||
long l = 0;
|
||||
foreach (var item in input)
|
||||
{
|
||||
yield return new KeyValuePair<T, long>(item, (l++) * num + pid);
|
||||
yield return new Tuple<T, long>(item, (l++) * num + pid);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1556,12 +1567,12 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
{
|
||||
this.starts = starts;
|
||||
}
|
||||
internal IEnumerable<KeyValuePair<T, long>> Execute(int pid, IEnumerable<T> input)
|
||||
internal IEnumerable<Tuple<T, long>> Execute(int pid, IEnumerable<T> input)
|
||||
{
|
||||
long l = 0;
|
||||
foreach (var item in input)
|
||||
{
|
||||
yield return new KeyValuePair<T, long>(item, (l++) + starts[pid]);
|
||||
yield return new Tuple<T, long>(item, (l++) + starts[pid]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -241,7 +241,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
///
|
||||
/// Do
|
||||
/// {{{
|
||||
/// RDD<KeyValuePair<string, string>> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path")
|
||||
/// RDD<Tuple<string, string>> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path")
|
||||
/// }}}
|
||||
///
|
||||
/// then `rdd` contains
|
||||
|
@ -259,9 +259,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="filePath"></param>
|
||||
/// <param name="minPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public RDD<KeyValuePair<byte[], byte[]>> WholeTextFiles(string filePath, int? minPartitions = null)
|
||||
public RDD<Tuple<byte[], byte[]>> WholeTextFiles(string filePath, int? minPartitions = null)
|
||||
{
|
||||
return new RDD<KeyValuePair<byte[], byte[]>>(SparkContextProxy.WholeTextFiles(filePath, minPartitions ?? DefaultMinPartitions), this, SerializedMode.Pair);
|
||||
return new RDD<Tuple<byte[], byte[]>>(SparkContextProxy.WholeTextFiles(filePath, minPartitions ?? DefaultMinPartitions), this, SerializedMode.Pair);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -279,7 +279,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// }}}
|
||||
///
|
||||
/// Do
|
||||
/// RDD<KeyValuePair<string, byte[]>> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
/// RDD<KeyValuePair<string, byte[]>> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
|
||||
/// RDD<Tuple<string, byte[]>> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
|
||||
///
|
||||
/// then `rdd` contains
|
||||
/// {{{
|
||||
|
@ -296,9 +296,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="filePath"></param>
|
||||
/// <param name="minPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public RDD<KeyValuePair<byte[], byte[]>> BinaryFiles(string filePath, int? minPartitions)
|
||||
public RDD<Tuple<byte[], byte[]>> BinaryFiles(string filePath, int? minPartitions)
|
||||
{
|
||||
return new RDD<KeyValuePair<byte[], byte[]>>(SparkContextProxy.BinaryFiles(filePath, minPartitions ?? DefaultMinPartitions), this, SerializedMode.Pair);
|
||||
return new RDD<Tuple<byte[], byte[]>>(SparkContextProxy.BinaryFiles(filePath, minPartitions ?? DefaultMinPartitions), this, SerializedMode.Pair);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -341,7 +341,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="valueConverterClass">(None by default)</param>
|
||||
/// <param name="conf"> Hadoop configuration, passed in as a dict (None by default)</param>
|
||||
/// <returns></returns>
|
||||
public RDD<byte[]> NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<KeyValuePair<string, string>> conf = null)
|
||||
public RDD<byte[]> NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<Tuple<string, string>> conf = null)
|
||||
{
|
||||
return new RDD<byte[]>(SparkContextProxy.NewAPIHadoopFile(filePath, inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, conf, 1), this, SerializedMode.None);
|
||||
}
|
||||
|
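With this change, Hadoop configuration is passed as a sequence of `Tuple<string, string>` entries rather than `KeyValuePair`s; each tuple is copied into a JVM-side map before the call. A caller-side sketch of the `NewAPIHadoopFile` overload shown above (the conf key and Hadoop class names are placeholders, not recommendations):

```c#
// Hypothetical usage sketch, not part of this diff; paths, classes and conf keys are placeholders.
var conf = new List<Tuple<string, string>>
{
    Tuple.Create("mapreduce.input.fileinputformat.inputdir", "hdfs://path/to/input")
};

RDD<byte[]> rows = sparkContext.NewAPIHadoopFile(
    "hdfs://path/to/input",
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf: conf);
```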
@ -360,7 +360,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="valueConverterClass">(None by default)</param>
|
||||
/// <param name="conf">Hadoop configuration, passed in as a dict (None by default)</param>
|
||||
/// <returns></returns>
|
||||
public RDD<byte[]> NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<KeyValuePair<string, string>> conf = null)
|
||||
public RDD<byte[]> NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<Tuple<string, string>> conf = null)
|
||||
{
|
||||
return new RDD<byte[]>(SparkContextProxy.NewAPIHadoopRDD(inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, conf, 1), this, SerializedMode.None);
|
||||
}
|
||||
|
@ -381,7 +381,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="valueConverterClass">(None by default)</param>
|
||||
/// <param name="conf">Hadoop configuration, passed in as a dict (None by default)</param>
|
||||
/// <returns></returns>
|
||||
public RDD<byte[]> HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<KeyValuePair<string, string>> conf = null)
|
||||
public RDD<byte[]> HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<Tuple<string, string>> conf = null)
|
||||
{
|
||||
return new RDD<byte[]>(SparkContextProxy.HadoopFile(filePath, inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, conf, 1), this, SerializedMode.None);
|
||||
}
|
||||
|
@ -400,7 +400,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="valueConverterClass">(None by default)</param>
|
||||
/// <param name="conf">Hadoop configuration, passed in as a dict (None by default)</param>
|
||||
/// <returns></returns>
|
||||
public RDD<byte[]> HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<KeyValuePair<string, string>> conf = null)
|
||||
public RDD<byte[]> HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass = null, string valueConverterClass = null, IEnumerable<Tuple<string, string>> conf = null)
|
||||
{
|
||||
return new RDD<byte[]>(SparkContextProxy.HadoopRDD(inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, conf, 1), this, SerializedMode.None);
|
||||
}
|
||||
|
@ -571,6 +571,17 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
SparkContextProxy.SetLogLevel(logLevel);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Run a job on a given set of partitions of an RDD.
|
||||
/// </summary>
|
||||
/// <typeparam name="T"></typeparam>
|
||||
/// <param name="rdd"></param>
|
||||
/// <param name="partitions"></param>
|
||||
public void RunJob<T>(RDD<T> rdd, IEnumerable<int> partitions)
|
||||
{
|
||||
SparkContextProxy.RunJob(rdd.RddProxy, partitions);
|
||||
}
|
||||
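The newly added `RunJob` triggers evaluation of an RDD on a chosen subset of partitions only, which is handy for warming a cached RDD or spot-checking output without computing everything. Sketch (partition indexes and `Cache` usage are illustrative assumptions):

```c#
// Hypothetical usage sketch, not part of this diff.
var rdd = sparkContext.Parallelize(Enumerable.Range(0, 1000), 10).Cache();
// Materialize only the first two partitions.
sparkContext.RunJob(rdd, new[] { 0, 1 });
```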
|
||||
/// <summary>
|
||||
/// Cancel active jobs for the specified group. See <see cref="SetJobGroup"/> for more information.
|
||||
/// </summary>
|
||||
|
|
|
@ -7,6 +7,7 @@ using System.Linq;
|
|||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Spark.CSharp.Proxy.Ipc;
|
||||
using System;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Interop.Ipc
|
||||
{
|
||||
|
@ -16,31 +17,31 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
[ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not required
|
||||
internal static class JvmBridgeUtils
|
||||
{
|
||||
public static JvmObjectReference GetJavaMap<K, V>(IEnumerable<KeyValuePair<K, V>> enumerable)
|
||||
public static JvmObjectReference GetJavaMap<K, V>(IEnumerable<Tuple<K, V>> enumerable)
|
||||
{
|
||||
var jmap = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.Hashtable", new object[] { });
|
||||
if (enumerable != null)
|
||||
{
|
||||
foreach (var item in enumerable)
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Key, item.Value });
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Item1, item.Item2 });
|
||||
}
|
||||
return jmap;
|
||||
}
|
||||
|
||||
public static JvmObjectReference GetJavaHashMap<K, V>(IEnumerable<KeyValuePair<K, V>> enumerable)
|
||||
public static JvmObjectReference GetJavaHashMap<K, V>(IEnumerable<Tuple<K, V>> enumerable)
|
||||
{
|
||||
var jmap = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.HashMap", new object[] { });
|
||||
if (enumerable != null)
|
||||
{
|
||||
foreach (var item in enumerable)
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Key, item.Value });
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Item1, item.Item2 });
|
||||
}
|
||||
return jmap;
|
||||
}
|
||||
|
||||
public static JvmObjectReference GetScalaMutableMap<K, V>(Dictionary<K, V> mapValues)
|
||||
public static JvmObjectReference GetScalaMutableMap<K, V>(IEnumerable<Tuple<K, V>> mapValues)
|
||||
{
|
||||
var hashMapReference = GetJavaHashMap(mapValues.Select(kvp => kvp));
|
||||
var hashMapReference = GetJavaHashMap(mapValues);
|
||||
return new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.JvmBridgeUtils", "toMutableMap", new object[] { hashMapReference }).ToString());
|
||||
}
|
||||
|
||||
|
|
|
@ -69,8 +69,8 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
int previousReferencesCountBenchmark = referencesCountBenchmark;
|
||||
checkCount *= 2;
|
||||
referencesCountBenchmark = referencesCountBenchmark + referencesCountBenchmark / 2;
|
||||
logger.LogDebug("Adjust checkCount from {0} to {1}, referencesCountBenchmark from {2} to {3}",
|
||||
previousCheckCount, checkCount, previousReferencesCountBenchmark, referencesCountBenchmark);
|
||||
//logger.LogDebug("Adjust checkCount from {0} to {1}, referencesCountBenchmark from {2} to {3}",
|
||||
// previousCheckCount, checkCount, previousReferencesCountBenchmark, referencesCountBenchmark);
|
||||
}
|
||||
return checkCount;
|
||||
}
|
||||
|
@ -134,14 +134,14 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
|
||||
private void RunReleaseObjectLoop()
|
||||
{
|
||||
logger.LogDebug("Checking objects thread start ...");
|
||||
//logger.LogDebug("Checking objects thread start ...");
|
||||
while (shouldKeepRunning)
|
||||
{
|
||||
ReleseGarbageCollectedObjects();
|
||||
Thread.Sleep(CheckInterval);
|
||||
}
|
||||
|
||||
logger.LogDebug("Checking objects thread stopped.");
|
||||
//logger.LogDebug("Checking objects thread stopped.");
|
||||
}
|
||||
|
||||
~WeakObjectManagerImpl()
|
||||
|
@ -165,13 +165,13 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
int referencesCount = weakReferences.Count;
|
||||
if (referencesCount == 0)
|
||||
{
|
||||
logger.LogDebug("check begin : quit as weakReferences.Count = 0");
|
||||
//logger.LogDebug("check begin : quit as weakReferences.Count = 0");
|
||||
return;
|
||||
}
|
||||
|
||||
var beginTime = DateTime.Now;
|
||||
int checkCount = checkCountController.AdjustCheckCount(referencesCount);
|
||||
logger.LogDebug("check begin : weakReferences.Count = {0}, checkCount: {1}", referencesCount, checkCount);
|
||||
//logger.LogDebug("check begin : weakReferences.Count = {0}, checkCount: {1}", referencesCount, checkCount);
|
||||
int garbageCount;
|
||||
var aliveList = ReleseGarbageCollectedObjects(checkCount, out garbageCount);
|
||||
|
||||
|
@ -179,11 +179,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
aliveList.ForEach(item => weakReferences.Enqueue(item));
|
||||
var timeStoreAlive = DateTime.Now;
|
||||
|
||||
logger.LogDebug("check end : released {0} garbage, remain {1} alive, used {2} ms : release garbage used {3} ms, store alive used {4} ms",
|
||||
garbageCount, weakReferences.Count, (DateTime.Now - beginTime).TotalMilliseconds,
|
||||
(timeReleaseGarbage - beginTime).TotalMilliseconds,
|
||||
(timeStoreAlive - timeReleaseGarbage).TotalMilliseconds
|
||||
);
|
||||
//logger.LogDebug("check end : released {0} garbage, remain {1} alive, used {2} ms : release garbage used {3} ms, store alive used {4} ms",
|
||||
// garbageCount, weakReferences.Count, (DateTime.Now - beginTime).TotalMilliseconds,
|
||||
// (timeReleaseGarbage - beginTime).TotalMilliseconds,
|
||||
// (timeStoreAlive - timeReleaseGarbage).TotalMilliseconds
|
||||
// );
|
||||
}
|
||||
|
||||
private List<WeakReferenceObjectIdPair> ReleseGarbageCollectedObjects(int checkCount, out int garbageCount)
|
||||
|
@ -208,7 +208,7 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
i++;
|
||||
if (i >= checkCount)
|
||||
{
|
||||
logger.LogDebug("Stop releasing as exceeded allowed checkCount: {0}", checkCount);
|
||||
//logger.LogDebug("Stop releasing as exceeded allowed checkCount: {0}", checkCount);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -238,7 +238,7 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
|
||||
public virtual void Dispose()
|
||||
{
|
||||
logger.LogInfo("Dispose {0}", this.GetType());
|
||||
//logger.LogInfo("Dispose {0}", this.GetType());
|
||||
shouldKeepRunning = false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,13 +31,13 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
string Name { get; }
|
||||
void SetName(string name);
|
||||
IRDDProxy RandomSampleWithRange(double lb, double ub, long seed);
|
||||
IRDDProxy SampleByKey(bool withReplacement, Dictionary<string, double> fractions, long seed);
|
||||
IRDDProxy SampleByKey(bool withReplacement, IEnumerable<Tuple<string, double>> fractions, long seed);
|
||||
IRDDProxy Zip(IRDDProxy other);
|
||||
string ToDebugString();
|
||||
void SaveAsNewAPIHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf);
|
||||
void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf);
|
||||
void SaveAsHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf);
|
||||
void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf, string compressionCodecClass);
|
||||
void SaveAsNewAPIHadoopDataset(IEnumerable<Tuple<string, string>> conf);
|
||||
void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf);
|
||||
void SaveAsHadoopDataset(IEnumerable<Tuple<string, string>> conf);
|
||||
void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf, string compressionCodecClass);
|
||||
void SaveAsSequenceFile(string path, string compressionCodecClass);
|
||||
void SaveAsTextFile(string path, string compressionCodecClass);
|
||||
long Count();
|
||||
|
|
|
@ -35,10 +35,10 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
IRDDProxy WholeTextFiles(string filePath, int minPartitions);
|
||||
IRDDProxy BinaryFiles(string filePath, int minPartitions);
|
||||
IRDDProxy SequenceFile(string filePath, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, int minSplits, int batchSize);
|
||||
IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize);
|
||||
IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize);
|
||||
IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize);
|
||||
IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize);
|
||||
IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize);
|
||||
IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize);
|
||||
IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize);
|
||||
IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize);
|
||||
IRDDProxy CheckpointFile(string filePath);
|
||||
IRDDProxy Union(IEnumerable<IRDDProxy> rdds);
|
||||
void AddFile(string path);
|
||||
|
|
|
@ -10,12 +10,15 @@ using Microsoft.Spark.CSharp.Sql;
|
|||
|
||||
namespace Microsoft.Spark.CSharp.Proxy
|
||||
{
|
||||
internal interface IUdfRegistration { }
|
||||
internal interface IUdfRegistrationProxy
|
||||
{
|
||||
void RegisterFunction(string name, byte[] command, string returnType);
|
||||
}
|
||||
|
||||
interface ISparkSessionProxy
|
||||
{
|
||||
ISqlContextProxy SqlContextProxy { get; }
|
||||
IUdfRegistration Udf { get; }
|
||||
IUdfRegistrationProxy Udf { get; }
|
||||
ICatalogProxy GetCatalog();
|
||||
IDataFrameReaderProxy Read();
|
||||
ISparkSessionProxy NewSession();
|
||||
|
|
|
@ -3,9 +3,6 @@
|
|||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
|
||||
|
@ -20,10 +17,9 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
void Checkpoint(string directory);
|
||||
IDStreamProxy TextFileStream(string directory);
|
||||
IDStreamProxy SocketTextStream(string hostname, int port, StorageLevelType storageLevelType);
|
||||
IDStreamProxy KafkaStream(Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, StorageLevelType storageLevelType);
|
||||
IDStreamProxy DirectKafkaStream(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets);
|
||||
IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets,
|
||||
int numPartitions, byte[] readFunc, string serializationMode);
|
||||
IDStreamProxy KafkaStream(IEnumerable<Tuple<string, int>> topics, IEnumerable<Tuple<string, string>> kafkaParams, StorageLevelType storageLevelType);
|
||||
IDStreamProxy DirectKafkaStream(List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets);
|
||||
IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets, int numPartitions, byte[] readFunc, string serializationMode);
|
||||
IDStreamProxy Union(IDStreamProxy firstDStreams, IDStreamProxy[] otherDStreams);
|
||||
void AwaitTermination();
|
||||
void AwaitTerminationOrTimeout(long timeout);
|
||||
|
@ -33,7 +29,6 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
IDStreamProxy CreateCSharpStateDStream(IDStreamProxy jdstream, byte[] func, string className, string serializationMode, string serializationMode2);
|
||||
IDStreamProxy CreateConstantInputDStream(IRDDProxy rddProxy);
|
||||
IDStreamProxy CreateCSharpInputDStream(byte[] func, string serializationMode);
|
||||
IDStreamProxy EventHubsUnionStream(Dictionary<string, string> eventHubsParams, StorageLevelType storageLevelType);
|
||||
|
||||
IDStreamProxy EventHubsUnionStream(IEnumerable<Tuple<string, string>> eventHubsParams, StorageLevelType storageLevelType);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -158,7 +158,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "setName", new object[] { name });
|
||||
}
|
||||
|
||||
public IRDDProxy SampleByKey(bool withReplacement, Dictionary<string, double> fractions, long seed)
|
||||
public IRDDProxy SampleByKey(bool withReplacement, IEnumerable<Tuple<string, double>> fractions, long seed)
|
||||
{
|
||||
var jfractions = JvmBridgeUtils.GetJavaMap(fractions) as JvmObjectReference;
|
||||
return new RDDIpcProxy(new JvmObjectReference((string) SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "sampleByKey", new object[] { withReplacement, jfractions, seed })));
|
||||
|
@ -176,25 +176,25 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new RDDIpcProxy(new JvmObjectReference((string) SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "zip", new object[] { (other as RDDIpcProxy).jvmRddReference })));
|
||||
}
|
||||
|
||||
public void SaveAsNewAPIHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf)
|
||||
public void SaveAsNewAPIHadoopDataset(IEnumerable<Tuple<string, string>> conf)
|
||||
{
|
||||
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsHadoopDataset", new object[] { jvmRddReference, false, jconf, null, null, true });
|
||||
}
|
||||
|
||||
public void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf)
|
||||
public void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf)
|
||||
{
|
||||
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsNewAPIHadoopFile", new object[] { jvmRddReference, false, path, outputFormatClass, keyClass, valueClass, null, null, jconf });
|
||||
}
|
||||
|
||||
public void SaveAsHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf)
|
||||
public void SaveAsHadoopDataset(IEnumerable<Tuple<string, string>> conf)
|
||||
{
|
||||
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsHadoopDataset", new object[] { jvmRddReference, false, jconf, null, null, false });
|
||||
}
|
||||
|
||||
public void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf, string compressionCodecClass)
|
||||
public void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf, string compressionCodecClass)
|
||||
{
|
||||
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsHadoopFile", new object[] { jvmRddReference, false, path, outputFormatClass, keyClass, valueClass, null, null, jconf, compressionCodecClass });
|
||||
|
|
|
@ -183,7 +183,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new RDDIpcProxy(jvmRddReference);
|
||||
}
|
||||
|
||||
public IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
|
||||
public IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
|
||||
{
|
||||
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
|
||||
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "newAPIHadoopFile",
|
||||
|
@ -191,7 +191,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new RDDIpcProxy(jvmRddReference);
|
||||
}
|
||||
|
||||
public IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
|
||||
public IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
|
||||
{
|
||||
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
|
||||
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "newAPIHadoopRDD",
|
||||
|
@ -199,7 +199,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new RDDIpcProxy(jvmRddReference);
|
||||
}
|
||||
|
||||
public IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
|
||||
public IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
|
||||
{
|
||||
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
|
||||
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "hadoopFile",
|
||||
|
@ -207,7 +207,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new RDDIpcProxy(jvmRddReference);
|
||||
}
|
||||
|
||||
public IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
|
||||
public IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
|
||||
{
|
||||
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
|
||||
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "hadoopRDD",
|
||||
|
|
|
@ -17,18 +17,13 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
private readonly JvmObjectReference jvmSparkSessionReference;
|
||||
private readonly ISqlContextProxy sqlContextProxy;
|
||||
|
||||
private readonly IUdfRegistration udfRegistration;
|
||||
private readonly IUdfRegistrationProxy udfRegistrationProxy;
|
||||
|
||||
public IUdfRegistration Udf
|
||||
public IUdfRegistrationProxy Udf
|
||||
{
|
||||
get
|
||||
{
|
||||
if (udfRegistration == null)
|
||||
{
|
||||
//TODO implementation needed
|
||||
}
|
||||
|
||||
return udfRegistration;
|
||||
return udfRegistrationProxy;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -46,6 +41,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
{
|
||||
this.jvmSparkSessionReference = jvmSparkSessionReference;
|
||||
sqlContextProxy = new SqlContextIpcProxy(GetSqlContextReference());
|
||||
udfRegistrationProxy = new UdfRegistrationIpcProxy(sqlContextProxy);
|
||||
}
|
||||
|
||||
private JvmObjectReference GetSqlContextReference()
|
||||
|
@ -98,4 +94,19 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSparkSessionReference, "stop");
|
||||
}
|
||||
}
|
||||
|
||||
[ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not required
|
||||
internal class UdfRegistrationIpcProxy : IUdfRegistrationProxy
|
||||
{
|
||||
private readonly ISqlContextProxy sqlContextProxy;
|
||||
internal UdfRegistrationIpcProxy(ISqlContextProxy sqlContextProxy)
|
||||
{
|
||||
this.sqlContextProxy = sqlContextProxy;
|
||||
}
|
||||
|
||||
public void RegisterFunction(string name, byte[] command, string returnType)
|
||||
{
|
||||
sqlContextProxy.RegisterFunction(name, command, returnType);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -197,7 +197,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new DStreamIpcProxy(jstream);
|
||||
}
|
||||
|
||||
public IDStreamProxy KafkaStream(Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, StorageLevelType storageLevelType)
|
||||
public IDStreamProxy KafkaStream(IEnumerable<Tuple<string, int>> topics, IEnumerable<Tuple<string, string>> kafkaParams, StorageLevelType storageLevelType)
|
||||
{
|
||||
JvmObjectReference jtopics = JvmBridgeUtils.GetJavaMap<string, int>(topics);
|
||||
JvmObjectReference jkafkaParams = JvmBridgeUtils.GetJavaMap<string, string>(kafkaParams);
|
||||
|
@ -208,16 +208,16 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new DStreamIpcProxy(jstream);
|
||||
}
|
||||
|
||||
public IDStreamProxy DirectKafkaStream(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets)
|
||||
public IDStreamProxy DirectKafkaStream(List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets)
|
||||
{
|
||||
JvmObjectReference jtopics = JvmBridgeUtils.GetJavaSet<string>(topics);
|
||||
JvmObjectReference jkafkaParams = JvmBridgeUtils.GetJavaMap<string, string>(kafkaParams);
|
||||
|
||||
var jTopicAndPartitions = fromOffsets.Select(x =>
|
||||
new KeyValuePair<JvmObjectReference, long>
|
||||
new Tuple<JvmObjectReference, long>
|
||||
(
|
||||
SparkCLRIpcProxy.JvmBridge.CallConstructor("kafka.common.TopicAndPartition", new object[] { x.Key.Split(':')[0], int.Parse(x.Key.Split(':')[1]) }),
|
||||
x.Value
|
||||
SparkCLRIpcProxy.JvmBridge.CallConstructor("kafka.common.TopicAndPartition", new object[] { x.Item1.Split(':')[0], int.Parse(x.Item1.Split(':')[1]) }),
|
||||
x.Item2
|
||||
)
|
||||
);
|
||||
|
||||
|
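As the code above shows, each `fromOffsets` entry is keyed by a `"topic:partition"` string that is split to construct a `kafka.common.TopicAndPartition` on the JVM side. A sketch of the shape callers now supply (topic names and offsets are illustrative):

```c#
// Hypothetical usage sketch, not part of this diff; topic names and offsets are illustrative.
var fromOffsets = new List<Tuple<string, long>>
{
    Tuple.Create("events:0", 0L),  // partition 0 of topic "events", starting at offset 0
    Tuple.Create("events:1", 42L)  // partition 1, starting at offset 42
};
```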
@ -228,17 +228,16 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new DStreamIpcProxy(jstream);
|
||||
}
|
||||
|
||||
public IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, Dictionary<string, string> kafkaParams,
|
||||
Dictionary<string, long> fromOffsets, int numPartitions, byte[] readFunc, string serializationMode)
|
||||
public IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets, int numPartitions, byte[] readFunc, string serializationMode)
|
||||
{
|
||||
JvmObjectReference jtopics = JvmBridgeUtils.GetJavaSet<string>(topics);
|
||||
JvmObjectReference jkafkaParams = JvmBridgeUtils.GetJavaMap<string, string>(kafkaParams);
|
||||
|
||||
var jTopicAndPartitions = fromOffsets.Select(x =>
|
||||
new KeyValuePair<JvmObjectReference, long>
|
||||
new Tuple<JvmObjectReference, long>
|
||||
(
|
||||
SparkCLRIpcProxy.JvmBridge.CallConstructor("kafka.common.TopicAndPartition", new object[] { x.Key.Split(':')[0], int.Parse(x.Key.Split(':')[1]) }),
|
||||
x.Value
|
||||
SparkCLRIpcProxy.JvmBridge.CallConstructor("kafka.common.TopicAndPartition", new object[] { x.Item1.Split(':')[0], int.Parse(x.Item1.Split(':')[1]) }),
|
||||
x.Item2
|
||||
)
|
||||
);
|
||||
|
||||
|
@ -250,7 +249,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new DStreamIpcProxy(jstream);
|
||||
}
|
||||
|
||||
public IDStreamProxy EventHubsUnionStream(Dictionary<string, string> eventHubsParams, StorageLevelType storageLevelType)
|
||||
public IDStreamProxy EventHubsUnionStream(IEnumerable<Tuple<string, string>> eventHubsParams, StorageLevelType storageLevelType)
|
||||
{
|
||||
JvmObjectReference eventHubsParamsReference = JvmBridgeUtils.GetScalaMutableMap<string, string>(eventHubsParams);
|
||||
JvmObjectReference storageLevelTypeReference = SparkContextIpcProxy.GetJavaStorageLevel(storageLevelType);
|
||||
|
|
|
@ -48,6 +48,11 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
get { return sparkContext; }
|
||||
}
|
||||
|
||||
public UdfRegistration Udf
|
||||
{
|
||||
get { return new UdfRegistration(sparkSessionProxy.Udf); }
|
||||
}
|
||||
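With the new `Udf` property, UDFs can be registered directly on a `SparkSession` and then referenced from SQL, mirroring the existing `SqlContext.RegisterFunction` behavior. Sketch (the `Sql` call, table and column names are assumptions for illustration):

```c#
// Hypothetical usage sketch, not part of this diff; table/column names are illustrative.
sparkSession.Udf.RegisterFunction<bool, string>("NotEmpty", s => !string.IsNullOrEmpty(s));
var nonEmpty = sparkSession.Sql("SELECT * FROM people WHERE NotEmpty(name)");
```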
|
||||
/// <summary>
|
||||
/// Builder for SparkSession
|
||||
/// </summary>
|
||||
|
|
|
@ -0,0 +1,254 @@
|
|||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics.CodeAnalysis;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Sql
|
||||
{
|
||||
/// <summary>
|
||||
/// Used for registering User Defined Functions. SparkSession.Udf is used to access instance of this type.
|
||||
/// </summary>
|
||||
public class UdfRegistration
|
||||
{
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(UdfRegistration));
|
||||
|
||||
private IUdfRegistrationProxy udfRegistrationProxy;
|
||||
|
||||
internal UdfRegistration(IUdfRegistrationProxy udfRegistrationProxy)
|
||||
{
|
||||
this.udfRegistrationProxy = udfRegistrationProxy;
|
||||
}
|
||||
|
||||
//TODO - the following section is a copy of the same functionality in SQLContext..refactoring needed
|
||||
#region UDF Registration
|
||||
/// <summary>
|
||||
/// Register UDF with no input argument, e.g:
|
||||
/// SqlContext.RegisterFunction<bool>("MyFilter", () => true);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter()");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
/// <param name="name"></param>
|
||||
/// <param name="f"></param>
|
||||
public void RegisterFunction<RT>(string name, Func<RT> f)
|
||||
{
|
||||
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
|
||||
|
||||
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT>(f).Execute;
|
||||
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Register UDF with 1 input argument, e.g:
|
||||
/// SqlContext.RegisterFunction<bool, string>("MyFilter", (arg1) => arg1 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
/// <typeparam name="A1"></typeparam>
|
||||
/// <param name="name"></param>
|
||||
/// <param name="f"></param>
|
||||
public void RegisterFunction<RT, A1>(string name, Func<A1, RT> f)
|
||||
{
|
||||
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
|
||||
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1>(f).Execute;
|
||||
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Register UDF with 2 input arguments, e.g:
|
||||
/// SqlContext.RegisterFunction<bool, string, string>("MyFilter", (arg1, arg2) => arg1 != null && arg2 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
/// <typeparam name="A1"></typeparam>
|
||||
/// <typeparam name="A2"></typeparam>
|
||||
/// <param name="name"></param>
|
||||
/// <param name="f"></param>
|
||||
public void RegisterFunction<RT, A1, A2>(string name, Func<A1, A2, RT> f)
|
||||
{
|
||||
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
|
||||
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2>(f).Execute;
|
||||
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Register UDF with 3 input arguments, e.g:
|
||||
/// SqlContext.RegisterFunction<bool, string, string, string>("MyFilter", (arg1, arg2, arg3) => arg1 != null && arg2 != null && arg3 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, columnName3)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
/// <typeparam name="A1"></typeparam>
|
||||
/// <typeparam name="A2"></typeparam>
|
||||
/// <typeparam name="A3"></typeparam>
|
||||
/// <param name="name"></param>
|
||||
/// <param name="f"></param>
|
||||
public void RegisterFunction<RT, A1, A2, A3>(string name, Func<A1, A2, A3, RT> f)
|
||||
{
|
||||
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
|
||||
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3>(f).Execute;
|
||||
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Register UDF with 4 input arguments, e.g:
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg4) => arg1 != null && arg2 != null && ... && arg3 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName4)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
/// <typeparam name="A1"></typeparam>
|
||||
/// <typeparam name="A2"></typeparam>
|
||||
/// <typeparam name="A3"></typeparam>
|
||||
/// <typeparam name="A4"></typeparam>
|
||||
/// <param name="name"></param>
|
||||
/// <param name="f"></param>
|
||||
public void RegisterFunction<RT, A1, A2, A3, A4>(string name, Func<A1, A2, A3, A4, RT> f)
|
||||
{
|
||||
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
|
||||
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4>(f).Execute;
|
||||
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Register UDF with 5 input arguments, e.g:
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg5) => arg1 != null && arg2 != null && ... && arg5 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName5)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
/// <typeparam name="A1"></typeparam>
|
||||
/// <typeparam name="A2"></typeparam>
|
||||
/// <typeparam name="A3"></typeparam>
|
||||
/// <typeparam name="A4"></typeparam>
|
||||
/// <typeparam name="A5"></typeparam>
|
||||
/// <param name="name"></param>
|
||||
/// <param name="f"></param>
|
||||
public void RegisterFunction<RT, A1, A2, A3, A4, A5>(string name, Func<A1, A2, A3, A4, A5, RT> f)
|
||||
{
|
||||
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
|
||||
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4, A5>(f).Execute;
|
||||
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Register UDF with 6 input arguments, e.g:
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg6) => arg1 != null && arg2 != null && ... && arg6 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName6)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
/// <typeparam name="A1"></typeparam>
|
||||
/// <typeparam name="A2"></typeparam>
|
||||
/// <typeparam name="A3"></typeparam>
|
||||
/// <typeparam name="A4"></typeparam>
|
||||
/// <typeparam name="A5"></typeparam>
|
||||
/// <typeparam name="A6"></typeparam>
|
||||
/// <param name="name"></param>
|
||||
/// <param name="f"></param>
|
||||
public void RegisterFunction<RT, A1, A2, A3, A4, A5, A6>(string name, Func<A1, A2, A3, A4, A5, A6, RT> f)
|
||||
{
|
||||
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
|
||||
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4, A5, A6>(f).Execute;
|
||||
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Register UDF with 7 input arguments, e.g:
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg7) => arg1 != null && arg2 != null && ... && arg7 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName7)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
/// <typeparam name="A1"></typeparam>
|
||||
/// <typeparam name="A2"></typeparam>
|
||||
/// <typeparam name="A3"></typeparam>
|
||||
/// <typeparam name="A4"></typeparam>
|
||||
/// <typeparam name="A5"></typeparam>
|
||||
/// <typeparam name="A6"></typeparam>
|
||||
/// <typeparam name="A7"></typeparam>
|
||||
/// <param name="name"></param>
|
||||
/// <param name="f"></param>
|
||||
public void RegisterFunction<RT, A1, A2, A3, A4, A5, A6, A7>(string name, Func<A1, A2, A3, A4, A5, A6, A7, RT> f)
|
||||
{
|
||||
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
|
||||
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7>(f).Execute;
|
||||
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Register UDF with 8 input arguments, e.g:
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg8) => arg1 != null && arg2 != null && ... && arg8 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName8)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
/// <typeparam name="A1"></typeparam>
|
||||
/// <typeparam name="A2"></typeparam>
|
||||
/// <typeparam name="A3"></typeparam>
|
||||
/// <typeparam name="A4"></typeparam>
|
||||
/// <typeparam name="A5"></typeparam>
|
||||
/// <typeparam name="A6"></typeparam>
|
||||
/// <typeparam name="A7"></typeparam>
|
||||
/// <typeparam name="A8"></typeparam>
|
||||
/// <param name="name"></param>
|
||||
/// <param name="f"></param>
|
||||
public void RegisterFunction<RT, A1, A2, A3, A4, A5, A6, A7, A8>(string name, Func<A1, A2, A3, A4, A5, A6, A7, A8, RT> f)
|
||||
{
|
||||
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
|
||||
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7, A8>(f).Execute;
|
||||
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Register UDF with 9 input arguments, e.g:
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg9) => arg1 != null && arg2 != null && ... && arg9 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName9)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
/// <typeparam name="A1"></typeparam>
|
||||
/// <typeparam name="A2"></typeparam>
|
||||
/// <typeparam name="A3"></typeparam>
|
||||
/// <typeparam name="A4"></typeparam>
|
||||
/// <typeparam name="A5"></typeparam>
|
||||
/// <typeparam name="A6"></typeparam>
|
||||
/// <typeparam name="A7"></typeparam>
|
||||
/// <typeparam name="A8"></typeparam>
|
||||
/// <typeparam name="A9"></typeparam>
|
||||
/// <param name="name"></param>
|
||||
/// <param name="f"></param>
|
||||
public void RegisterFunction<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9>(string name, Func<A1, A2, A3, A4, A5, A6, A7, A8, A9, RT> f)
|
||||
{
|
||||
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
|
||||
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9>(f).Execute;
|
||||
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Register UDF with 10 input arguments, e.g:
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg10) => arg1 != null && arg2 != null && ... && arg10 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName10)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
/// <typeparam name="A1"></typeparam>
|
||||
/// <typeparam name="A2"></typeparam>
|
||||
/// <typeparam name="A3"></typeparam>
|
||||
/// <typeparam name="A4"></typeparam>
|
||||
/// <typeparam name="A5"></typeparam>
|
||||
/// <typeparam name="A6"></typeparam>
|
||||
/// <typeparam name="A7"></typeparam>
|
||||
/// <typeparam name="A8"></typeparam>
|
||||
/// <typeparam name="A9"></typeparam>
|
||||
/// <typeparam name="A10"></typeparam>
|
||||
/// <param name="name"></param>
|
||||
/// <param name="f"></param>
|
||||
public void RegisterFunction<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10>(string name, Func<A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT> f)
|
||||
{
|
||||
logger.LogInfo("Name of the function to register {0}, method info", name, f.Method);
|
||||
Func<int, IEnumerable<dynamic>, IEnumerable<dynamic>> udfHelper = new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10>(f).Execute;
|
||||
udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT)));
|
||||
}
|
||||
#endregion
|
||||
}
|
||||
}
|
|
@ -153,7 +153,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <returns></returns>
|
||||
public DStream<T> Reduce(Func<T, T, T> f)
|
||||
{
|
||||
return Map<KeyValuePair<string, T>>(x => new KeyValuePair<string, T>(string.Empty, x)).ReduceByKey(f, 1).Map<T>(kvp => kvp.Value);
|
||||
return Map<Tuple<string, T>>(x => new Tuple<string, T>(string.Empty, x)).ReduceByKey(f, 1).Map<T>(kvp => kvp.Item2);
|
||||
}
|
||||
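`Reduce` on a DStream is still implemented by keying every element with a constant key, running `ReduceByKey` over a single partition, and projecting the value, now via `Item2`. Caller-side sketch (`counts` is an assumed `DStream<int>`):

```c#
// Hypothetical usage sketch, not part of this diff.
DStream<int> totals = counts.Reduce((x, y) => x + y); // one reduced value per batch interval
```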
|
||||
/// <summary>
|
||||
|
@ -235,9 +235,9 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// distinct value in each RDD of this DStream.
|
||||
/// </summary>
|
||||
/// <returns></returns>
|
||||
public DStream<KeyValuePair<T, long>> CountByValue(int numPartitions = 0)
|
||||
public DStream<Tuple<T, long>> CountByValue(int numPartitions = 0)
|
||||
{
|
||||
return Map(v => new KeyValuePair<T, long>(v, 1L)).ReduceByKey((x, y) => x + y, numPartitions);
|
||||
return Map(v => new Tuple<T, long>(v, 1L)).ReduceByKey((x, y) => x + y, numPartitions);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -427,9 +427,9 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <returns></returns>
|
||||
public DStream<T> ReduceByWindow(Func<T, T, T> reduceFunc, Func<T, T, T> invReduceFunc, int windowSeconds, int slideSeconds = 0)
|
||||
{
|
||||
var keyed = Map(v => new KeyValuePair<int, T>(1, v));
|
||||
var keyed = Map(v => new Tuple<int, T>(1, v));
|
||||
var reduced = keyed.ReduceByKeyAndWindow(reduceFunc, invReduceFunc, windowSeconds, slideSeconds, 1);
|
||||
return reduced.Map(kv => (T)kv.Value);
|
||||
return reduced.Map(kv => (T)kv.Item2);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -462,9 +462,9 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <returns></returns>
|
||||
public DStream<long> CountByValueAndWindow(int windowSeconds, int slideSeconds, int numPartitions = 0)
|
||||
{
|
||||
var keyed = Map(v => new KeyValuePair<T, int>(v, 1));
|
||||
var keyed = Map(v => new Tuple<T, int>(v, 1));
|
||||
var counted = keyed.ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, windowSeconds, slideSeconds, numPartitions);
|
||||
return counted.Filter(kv => kv.Value > 0).Count();
|
||||
return counted.Filter(kv => kv.Item2 > 0).Count();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// </param>
|
||||
/// <param name="storageLevelType">Storage level, by default it is MEMORY_ONLY</param>
|
||||
/// <returns>DStream with byte[] representing events from EventHub</returns>
|
||||
public static DStream<byte[]> CreateUnionStream(StreamingContext ssc, Dictionary<string, string> eventhubsParams, StorageLevelType storageLevelType = StorageLevelType.MEMORY_ONLY)
|
||||
public static DStream<byte[]> CreateUnionStream(StreamingContext ssc, IEnumerable<Tuple<string, string>> eventhubsParams, StorageLevelType storageLevelType = StorageLevelType.MEMORY_ONLY)
|
||||
{
|
||||
return new DStream<byte[]>(ssc.streamingContextProxy.EventHubsUnionStream(eventhubsParams, storageLevelType), ssc, SerializedMode.None);
|
||||
}
|
||||
|
|
|
@ -28,7 +28,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="topics">Dict of (topic_name -> numPartitions) to consume. Each partition is consumed in its own thread.</param>
|
||||
/// <param name="kafkaParams">Additional params for Kafka</param>
|
||||
/// <returns>A DStream object</returns>
|
||||
public static DStream<KeyValuePair<byte[], byte[]>> CreateStream(StreamingContext ssc, string zkQuorum, string groupId, Dictionary<string, int> topics, Dictionary<string, string> kafkaParams)
|
||||
public static DStream<Tuple<byte[], byte[]>> CreateStream(StreamingContext ssc, string zkQuorum, string groupId, IEnumerable<Tuple<string, int>> topics, IEnumerable<Tuple<string, string>> kafkaParams)
|
||||
{
|
||||
return CreateStream(ssc, zkQuorum, groupId, topics, kafkaParams, StorageLevelType.MEMORY_AND_DISK_SER_2);
|
||||
}
|
||||
|
@ -43,19 +43,21 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="kafkaParams">Additional params for Kafka</param>
/// <param name="storageLevelType">RDD storage level.</param>
/// <returns>A DStream object</returns>
public static DStream<KeyValuePair<byte[], byte[]>> CreateStream(StreamingContext ssc, string zkQuorum, string groupId, Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, StorageLevelType storageLevelType)
public static DStream<Tuple<byte[], byte[]>> CreateStream(StreamingContext ssc, string zkQuorum, string groupId, IEnumerable<Tuple<string, int>> topics, IEnumerable<Tuple<string, string>> kafkaParams, StorageLevelType storageLevelType)
{
if (kafkaParams == null)
kafkaParams = new Dictionary<string, string>();
kafkaParams = new List<Tuple<string, string>>();

var kafkaParamsMap = kafkaParams.ToDictionary(x => x.Item1, x => x.Item2);

if (!string.IsNullOrEmpty(zkQuorum))
kafkaParams["zookeeper.connect"] = zkQuorum;
kafkaParamsMap["zookeeper.connect"] = zkQuorum;
if (groupId != null)
kafkaParams["group.id"] = groupId;
if (kafkaParams.ContainsKey("zookeeper.connection.timeout.ms"))
kafkaParams["zookeeper.connection.timeout.ms"] = "10000";
kafkaParamsMap["group.id"] = groupId;
if (kafkaParamsMap.ContainsKey("zookeeper.connection.timeout.ms"))
kafkaParamsMap["zookeeper.connection.timeout.ms"] = "10000";

return new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.KafkaStream(topics, kafkaParams, storageLevelType), ssc);
return new DStream<Tuple<byte[], byte[]>>(ssc.streamingContextProxy.KafkaStream(topics, kafkaParamsMap.Select(x => Tuple.Create(x.Key, x.Value)), storageLevelType), ssc);
}

/// <summary>
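A hedged sketch of calling the reworked receiver-based overload above. The containing class name (KafkaUtils), the StreamingContext `ssc`, and the connection values are assumptions added for illustration only:

```c#
// topics and kafkaParams are now IEnumerable<Tuple<...>> rather than Dictionary<...>.
var topics = new List<Tuple<string, int>> { Tuple.Create("events", 2) };
var kafkaParams = new List<Tuple<string, string>>
{
    Tuple.Create("zookeeper.connection.timeout.ms", "10000")
};

// KafkaUtils is assumed to be the class declaring CreateStream.
DStream<Tuple<byte[], byte[]>> stream = KafkaUtils.CreateStream(
    ssc, "zk-host:2181", "sample-group", topics, kafkaParams);

// The message payload is now read as Item2 instead of Value.
var payloads = stream.Map(m => Encoding.UTF8.GetString(m.Item2));
```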
@ -82,7 +84,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// </param>
/// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
/// <returns>A DStream object</returns>
public static DStream<KeyValuePair<byte[], byte[]>> CreateDirectStream(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets)
public static DStream<Tuple<byte[], byte[]>> CreateDirectStream(StreamingContext ssc, List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets)
{
int numPartitions = GetNumPartitionsFromConfig(ssc, topics, kafkaParams);
if (numPartitions >= 0 ||
@ -90,9 +92,9 @@ namespace Microsoft.Spark.CSharp.Streaming
ssc.SparkContext.SparkConf.SparkConfProxy.GetInt("spark.mobius.streaming.kafka.numReceivers", 0) > 0 ||
topics.Any(topic => ssc.SparkContext.SparkConf.SparkConfProxy.GetInt("spark.mobius.streaming.kafka.maxMessagesPerTask." + topic, 0) > 0))
{
return new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
return new DStream<Tuple<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
}
return new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStream(topics, kafkaParams, fromOffsets), ssc, SerializedMode.Pair);
return new DStream<Tuple<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStream(topics, kafkaParams, fromOffsets), ssc, SerializedMode.Pair);
}

/// <summary>
@ -120,18 +122,18 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
/// <param name="readFunc">user function to process the kafka data.</param>
/// <returns>A DStream object</returns>
public static DStream<T> CreateDirectStream<T>(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets, Func<int, IEnumerable<KeyValuePair<byte[], byte[]>>, IEnumerable<T>> readFunc)
public static DStream<T> CreateDirectStream<T>(StreamingContext ssc, List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets, Func<int, IEnumerable<Tuple<byte[], byte[]>>, IEnumerable<T>> readFunc)
{
int numPartitions = GetNumPartitionsFromConfig(ssc, topics, kafkaParams);
if (ssc.SparkContext.SparkConf.SparkConfProxy.GetInt("spark.mobius.streaming.kafka.numReceivers", 0) <= 0)
{
var dstream = new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
var dstream = new DStream<Tuple<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
return dstream.MapPartitionsWithIndex(readFunc, true);
}

var mapPartitionsWithIndexHelper = new MapPartitionsWithIndexHelper<KeyValuePair<byte[], byte[]>, T>(readFunc, true);
var transformHelper = new TransformHelper<KeyValuePair<byte[], byte[]>, T>(mapPartitionsWithIndexHelper.Execute);
var transformDynamicHelper = new TransformDynamicHelper<KeyValuePair<byte[], byte[]>, T>(transformHelper.Execute);
var mapPartitionsWithIndexHelper = new MapPartitionsWithIndexHelper<Tuple<byte[], byte[]>, T>(readFunc, true);
var transformHelper = new TransformHelper<Tuple<byte[], byte[]>, T>(mapPartitionsWithIndexHelper.Execute);
var transformDynamicHelper = new TransformDynamicHelper<Tuple<byte[], byte[]>, T>(transformHelper.Execute);
Func<double, RDD<dynamic>, RDD<dynamic>> func = transformDynamicHelper.Execute;
var formatter = new BinaryFormatter();
var stream = new MemoryStream();
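The generic direct-stream overload above now hands the readFunc Tuple pairs instead of KeyValuePairs. A sketch under the same assumptions (containing class named KafkaUtils, an existing StreamingContext `ssc`, illustrative broker and topic names):

```c#
var kafkaParams = new List<Tuple<string, string>>
{
    Tuple.Create("metadata.broker.list", "broker-host:9092")   // illustrative broker list
};
var fromOffsets = new List<Tuple<string, long>>();              // empty: use default offsets

// readFunc now receives IEnumerable<Tuple<byte[], byte[]>> per partition.
Func<int, IEnumerable<Tuple<byte[], byte[]>>, IEnumerable<string>> readFunc =
    (partitionIndex, messages) =>
        messages.Select(m => Encoding.UTF8.GetString(m.Item2));

var decoded = KafkaUtils.CreateDirectStream(
    ssc, new List<string> { "events" }, kafkaParams, fromOffsets, readFunc);
```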
@ -146,11 +148,11 @@ namespace Microsoft.Spark.CSharp.Streaming
/// </summary>
/// <param name="input"></param>
/// <returns></returns>
public static OffsetRange GetOffsetRange(IEnumerable<KeyValuePair<byte[], byte[]>> input)
public static OffsetRange GetOffsetRange(IEnumerable<Tuple<byte[], byte[]>> input)
{
int count = 2;
int i = 0;
var offsetRange = new KeyValuePair<byte[], byte[]>[count];
var offsetRange = new Tuple<byte[], byte[]>[count];
foreach (var message in input)
{
offsetRange[i++ % count] = message;
@ -163,12 +165,12 @@ namespace Microsoft.Spark.CSharp.Streaming
throw new ArgumentException("Expecting kafka OffsetRange metadata.");
}

var topicAndClusterId = SerDe.ToString(offsetRange[0].Key);
var topicAndClusterId = SerDe.ToString(offsetRange[0].Item1);
var topic = topicAndClusterId.Split(',')[0];
var clusterId = topicAndClusterId.Split(',')[1];
var partition = SerDe.ToInt(offsetRange[0].Value);
var fromOffset = SerDe.ReadLong(new MemoryStream(offsetRange[1].Key));
var untilOffset = SerDe.ReadLong(new MemoryStream(offsetRange[1].Value));
var partition = SerDe.ToInt(offsetRange[0].Item2);
var fromOffset = SerDe.ReadLong(new MemoryStream(offsetRange[1].Item1));
var untilOffset = SerDe.ReadLong(new MemoryStream(offsetRange[1].Item2));

return new OffsetRange(topic, clusterId, partition, fromOffset, untilOffset);
}
@ -181,12 +183,13 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="topics"></param>
/// <param name="kafkaParams"></param>
/// <returns></returns>
private static int GetNumPartitionsFromConfig(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams)
private static int GetNumPartitionsFromConfig(StreamingContext ssc, List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams)
{
if (topics == null || topics.Count == 0)
return -1;

string clusterId = kafkaParams.ContainsKey("cluster.id") ? "." + kafkaParams["cluster.id"] : null;
var kafkaParamsMap = kafkaParams.ToDictionary(x => x.Item1, x => x.Item2);
string clusterId = kafkaParamsMap.ContainsKey("cluster.id") ? "." + kafkaParamsMap["cluster.id"] : null;
return ssc.SparkContext.SparkConf.SparkConfProxy.GetInt("spark.mobius.streaming.kafka.numPartitions." + topics[0] + clusterId, -1);
}
}
@ -29,9 +29,9 @@ namespace Microsoft.Spark.CSharp.Streaming
[Serializable]
public class MapWithStateDStream<K, V, S, M> : DStream<M>
{
internal DStream<KeyValuePair<K, S>> snapshotsDStream;
internal DStream<Tuple<K, S>> snapshotsDStream;

internal MapWithStateDStream(DStream<M> mappedDataDStream, DStream<KeyValuePair<K, S>> snapshotsDStream)
internal MapWithStateDStream(DStream<M> mappedDataDStream, DStream<Tuple<K, S>> snapshotsDStream)
: base(mappedDataDStream.DStreamProxy, mappedDataDStream.streamingContext)
{
this.snapshotsDStream = snapshotsDStream;
@ -40,7 +40,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <summary>
/// Return a pair DStream where each RDD is the snapshot of the state of all the keys.
/// </summary>
public DStream<KeyValuePair<K, S>> StateSnapshots()
public DStream<Tuple<K, S>> StateSnapshots()
{
return snapshotsDStream;
}
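StateSnapshots above now yields Tuple<K, S> pairs. A small sketch of consuming the snapshot stream, assuming a MapWithStateDStream<string, int, int, string> named `stateDStream` was created elsewhere with MapWithState:

```c#
// Keys and state values are read through Item1/Item2 rather than Key/Value.
DStream<Tuple<string, int>> snapshots = stateDStream.StateSnapshots();

snapshots
    .Filter(kv => kv.Item2 > 0)
    .Map(kv => string.Format("{0} -> {1}", kv.Item1, kv.Item2))
    .Print();   // Print is used here only for illustration
```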
@ -87,11 +87,11 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
{
|
||||
}
|
||||
|
||||
public MapWithStateRDDRecord(long t, IEnumerable<KeyValuePair<K, S>> iter)
|
||||
public MapWithStateRDDRecord(long t, IEnumerable<Tuple<K, S>> iter)
|
||||
{
|
||||
foreach (var p in iter)
|
||||
{
|
||||
stateMap[p.Key] = new KeyedState<S>(p.Value, t);
|
||||
stateMap[p.Item1] = new KeyedState<S>(p.Item2, t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -131,14 +131,14 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
|
||||
while (enumerator.MoveNext())
|
||||
{
|
||||
KeyValuePair<K, V> kv = enumerator.Current;
|
||||
Tuple<K, V> kv = enumerator.Current;
|
||||
KeyedState<S> keyedState;
|
||||
State<S> wrappedState = stateRddRecord.stateMap.TryGetValue(kv.Key, out keyedState) ? new State<S>(keyedState.state) : new State<S>(default(S));
|
||||
State<S> wrappedState = stateRddRecord.stateMap.TryGetValue(kv.Item1, out keyedState) ? new State<S>(keyedState.state) : new State<S>(default(S));
|
||||
|
||||
var mappedData = default(M);
|
||||
try
|
||||
{
|
||||
mappedData = f(kv.Key, kv.Value, wrappedState);
|
||||
mappedData = f(kv.Item1, kv.Item2, wrappedState);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
|
@ -149,11 +149,11 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
|
||||
if (wrappedState.removed)
|
||||
{
|
||||
stateRddRecord.stateMap.Remove(kv.Key);
|
||||
stateRddRecord.stateMap.Remove(kv.Item1);
|
||||
}
|
||||
else if (wrappedState.updated || wrappedState.defined)
|
||||
{
|
||||
stateRddRecord.stateMap[kv.Key] = new KeyedState<S>(wrappedState.state, ticks);
|
||||
stateRddRecord.stateMap[kv.Item1] = new KeyedState<S>(wrappedState.state, ticks);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -223,7 +223,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
valuesRDD = prevFunc(t, valuesRDD);
|
||||
}
|
||||
|
||||
var values = valuesRDD.ConvertTo<KeyValuePair<K, V>>().PartitionBy(stateSpec.numPartitions);
|
||||
var values = valuesRDD.ConvertTo<Tuple<K, V>>().PartitionBy(stateSpec.numPartitions);
|
||||
|
||||
if (stateRDD == null)
|
||||
{
|
||||
|
@ -259,12 +259,12 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
this.ticks = ticks;
|
||||
}
|
||||
|
||||
internal IEnumerable<MapWithStateRDDRecord<K, S, M>> Execute(IEnumerable<KeyValuePair<K, S>> iter)
|
||||
internal IEnumerable<MapWithStateRDDRecord<K, S, M>> Execute(IEnumerable<Tuple<K, S>> iter)
|
||||
{
|
||||
return new[] {new MapWithStateRDDRecord<K, S, M>(ticks, iter)};
|
||||
}
|
||||
|
||||
internal IEnumerable<MapWithStateRDDRecord<K, S, M>> ExecuteWithoutInitialState(IEnumerable<KeyValuePair<K, V>> iter)
|
||||
internal IEnumerable<MapWithStateRDDRecord<K, S, M>> ExecuteWithoutInitialState(IEnumerable<Tuple<K, V>> iter)
|
||||
{
|
||||
return new[] { new MapWithStateRDDRecord<K, S, M>() };
|
||||
}
|
||||
|
@ -283,7 +283,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
internal Func<K, V, State<S>, M> mappingFunction;
|
||||
internal int numPartitions;
|
||||
internal TimeSpan idleDuration = TimeSpan.FromTicks(0);
|
||||
internal RDD<KeyValuePair<K, S>> initialState = null;
|
||||
internal RDD<Tuple<K, S>> initialState = null;
|
||||
|
||||
/// <summary>
|
||||
/// Create a StateSpec for setting all the specifications of the `mapWithState` operation on a pair DStream.
|
||||
|
@ -325,7 +325,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// </summary>
|
||||
/// <param name="initialState">The given initial state</param>
|
||||
/// <returns>The new StateSpec object</returns>
|
||||
public StateSpec<K, V, S, M> InitialState(RDD<KeyValuePair<K, S>> initialState)
|
||||
public StateSpec<K, V, S, M> InitialState(RDD<Tuple<K, S>> initialState)
|
||||
{
|
||||
this.initialState = initialState;
|
||||
return this;
|
||||
|
|
|
@ -15,7 +15,7 @@ using Microsoft.Spark.CSharp.Interop;
namespace Microsoft.Spark.CSharp.Streaming
{
/// <summary>
/// operations only available to KeyValuePair RDD
/// operations only available to Tuple RDD
/// </summary>
public static class PairDStreamFunctions
{
@ -28,7 +28,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="reduceFunc"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, V>> ReduceByKey<K, V>(this DStream<KeyValuePair<K, V>> self, Func<V, V, V> reduceFunc, int numPartitions = 0)
public static DStream<Tuple<K, V>> ReduceByKey<K, V>(this DStream<Tuple<K, V>> self, Func<V, V, V> reduceFunc, int numPartitions = 0)
{
return self.CombineByKey(() => default(V), reduceFunc, reduceFunc, numPartitions);
}
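With the extension methods now defined over DStream<Tuple<K, V>>, a pair stream is built by mapping elements into Tuples. A brief sketch, assuming an existing DStream<string> `records` of "sensor,reading" lines:

```c#
// Per-sensor maximum reading in each batch, via the Tuple-based ReduceByKey above.
DStream<Tuple<string, double>> maxPerSensor = records
    .Map(line => line.Split(','))
    .Map(cols => new Tuple<string, double>(cols[0], double.Parse(cols[1])))
    .ReduceByKey((x, y) => Math.Max(x, y));
```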
@ -45,8 +45,8 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="mergeCombiners"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, C>> CombineByKey<K, V, C>(
|
||||
this DStream<KeyValuePair<K, V>> self,
|
||||
public static DStream<Tuple<K, C>> CombineByKey<K, V, C>(
|
||||
this DStream<Tuple<K, V>> self,
|
||||
Func<C> createCombiner,
|
||||
Func<C, V, C> mergeValue,
|
||||
Func<C, C, C> mergeCombiners,
|
||||
|
@ -55,7 +55,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
if (numPartitions <= 0)
|
||||
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
|
||||
|
||||
return self.Transform<KeyValuePair<K, C>>(new CombineByKeyHelper<K, V, C>(createCombiner, mergeValue, mergeCombiners, numPartitions).Execute);
|
||||
return self.Transform<Tuple<K, C>>(new CombineByKeyHelper<K, V, C>(createCombiner, mergeValue, mergeCombiners, numPartitions).Execute);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -66,12 +66,12 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="self"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, V>> PartitionBy<K, V>(this DStream<KeyValuePair<K, V>> self, int numPartitions = 0)
|
||||
public static DStream<Tuple<K, V>> PartitionBy<K, V>(this DStream<Tuple<K, V>> self, int numPartitions = 0)
|
||||
{
|
||||
if (numPartitions <= 0)
|
||||
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
|
||||
|
||||
return self.Transform<KeyValuePair<K, V>>(new PartitionByHelper<K, V>(numPartitions).Execute);
|
||||
return self.Transform<Tuple<K, V>>(new PartitionByHelper<K, V>(numPartitions).Execute);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -84,7 +84,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="self"></param>
|
||||
/// <param name="func"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, U>> MapValues<K, V, U>(this DStream<KeyValuePair<K, V>> self, Func<V, U> func)
|
||||
public static DStream<Tuple<K, U>> MapValues<K, V, U>(this DStream<Tuple<K, V>> self, Func<V, U> func)
|
||||
{
|
||||
return self.Map(new MapValuesHelper<K, V, U>(func).Execute, true);
|
||||
}
|
||||
|
@ -99,7 +99,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="self"></param>
|
||||
/// <param name="func"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, U>> FlatMapValues<K, V, U>(this DStream<KeyValuePair<K, V>> self, Func<V, IEnumerable<U>> func)
|
||||
public static DStream<Tuple<K, U>> FlatMapValues<K, V, U>(this DStream<Tuple<K, V>> self, Func<V, IEnumerable<U>> func)
|
||||
{
|
||||
return self.FlatMap(new FlatMapValuesHelper<K, V, U>(func).Execute, true);
|
||||
}
|
||||
|
@ -112,9 +112,9 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="self"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, List<V>>> GroupByKey<K, V>(this DStream<KeyValuePair<K, V>> self, int numPartitions = 0)
|
||||
public static DStream<Tuple<K, List<V>>> GroupByKey<K, V>(this DStream<Tuple<K, V>> self, int numPartitions = 0)
|
||||
{
|
||||
return self.Transform<KeyValuePair<K, List<V>>>(new GroupByKeyHelper<K, V>(numPartitions).Execute);
|
||||
return self.Transform<Tuple<K, List<V>>>(new GroupByKeyHelper<K, V>(numPartitions).Execute);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -128,12 +128,12 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="other"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, Tuple<List<V>, List<W>>>> GroupWith<K, V, W>(this DStream<KeyValuePair<K, V>> self, DStream<KeyValuePair<K, W>> other, int numPartitions = 0)
|
||||
public static DStream<Tuple<K, Tuple<List<V>, List<W>>>> GroupWith<K, V, W>(this DStream<Tuple<K, V>> self, DStream<Tuple<K, W>> other, int numPartitions = 0)
|
||||
{
|
||||
if (numPartitions <= 0)
|
||||
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
|
||||
|
||||
return self.TransformWith<KeyValuePair<K, W>, KeyValuePair<K, Tuple<List<V>, List<W>>>>(new GroupWithHelper<K, V, W>(numPartitions).Execute, other);
|
||||
return self.TransformWith<Tuple<K, W>, Tuple<K, Tuple<List<V>, List<W>>>>(new GroupWithHelper<K, V, W>(numPartitions).Execute, other);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -147,12 +147,12 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="other"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, Tuple<V, W>>> Join<K, V, W>(this DStream<KeyValuePair<K, V>> self, DStream<KeyValuePair<K, W>> other, int numPartitions = 0)
|
||||
public static DStream<Tuple<K, Tuple<V, W>>> Join<K, V, W>(this DStream<Tuple<K, V>> self, DStream<Tuple<K, W>> other, int numPartitions = 0)
|
||||
{
|
||||
if (numPartitions <= 0)
|
||||
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
|
||||
|
||||
return self.TransformWith<KeyValuePair<K, W>, KeyValuePair<K, Tuple<V, W>>>(new JoinHelper<K, V, W>(numPartitions).Execute, other);
|
||||
return self.TransformWith<Tuple<K, W>, Tuple<K, Tuple<V, W>>>(new JoinHelper<K, V, W>(numPartitions).Execute, other);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -166,12 +166,12 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="other"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, Tuple<V, Option<W>>>> LeftOuterJoin<K, V, W>(this DStream<KeyValuePair<K, V>> self, DStream<KeyValuePair<K, W>> other, int numPartitions = 0)
|
||||
public static DStream<Tuple<K, Tuple<V, Option<W>>>> LeftOuterJoin<K, V, W>(this DStream<Tuple<K, V>> self, DStream<Tuple<K, W>> other, int numPartitions = 0)
|
||||
{
|
||||
if (numPartitions <= 0)
|
||||
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
|
||||
|
||||
return self.TransformWith<KeyValuePair<K, W>, KeyValuePair<K, Tuple<V, Option<W>>>>(new LeftOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
|
||||
return self.TransformWith<Tuple<K, W>, Tuple<K, Tuple<V, Option<W>>>>(new LeftOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -185,12 +185,12 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="other"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, Tuple<Option<V>, W>>> RightOuterJoin<K, V, W>(this DStream<KeyValuePair<K, V>> self, DStream<KeyValuePair<K, W>> other, int numPartitions = 0)
|
||||
public static DStream<Tuple<K, Tuple<Option<V>, W>>> RightOuterJoin<K, V, W>(this DStream<Tuple<K, V>> self, DStream<Tuple<K, W>> other, int numPartitions = 0)
|
||||
{
|
||||
if (numPartitions <= 0)
|
||||
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
|
||||
|
||||
return self.TransformWith<KeyValuePair<K, W>, KeyValuePair<K, Tuple<Option<V>, W>>>(new RightOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
|
||||
return self.TransformWith<Tuple<K, W>, Tuple<K, Tuple<Option<V>, W>>>(new RightOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -204,12 +204,12 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="other"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, Tuple<Option<V>, Option<W>>>> FullOuterJoin<K, V, W>(this DStream<KeyValuePair<K, V>> self, DStream<KeyValuePair<K, W>> other, int numPartitions = 0)
|
||||
public static DStream<Tuple<K, Tuple<Option<V>, Option<W>>>> FullOuterJoin<K, V, W>(this DStream<Tuple<K, V>> self, DStream<Tuple<K, W>> other, int numPartitions = 0)
|
||||
{
|
||||
if (numPartitions <= 0)
|
||||
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
|
||||
|
||||
return self.TransformWith<KeyValuePair<K, W>, KeyValuePair<K, Tuple<Option<V>, Option<W>>>>(new FullOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
|
||||
return self.TransformWith<Tuple<K, W>, Tuple<K, Tuple<Option<V>, Option<W>>>>(new FullOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -227,7 +227,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// </param>
|
||||
/// <param name="numPartitions">Number of partitions of each RDD in the new DStream.</param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, IEnumerable<V>>> GroupByKeyAndWindow<K, V>(this DStream<KeyValuePair<K, V>> self,
|
||||
public static DStream<Tuple<K, IEnumerable<V>>> GroupByKeyAndWindow<K, V>(this DStream<Tuple<K, V>> self,
|
||||
int windowSeconds, int slideSeconds, int numPartitions = 0)
|
||||
{
|
||||
var ls = self.MapValues(x => new List<V> { x });
|
||||
|
@ -259,13 +259,13 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <param name="numPartitions">number of partitions of each RDD in the new DStream.</param>
/// <param name="filterFunc">function to filter expired key-value pairs; only pairs that satisfy the function are retained set this to null if you do not want to filter</param>
/// <returns></returns>
public static DStream<KeyValuePair<K, V>> ReduceByKeyAndWindow<K, V>(this DStream<KeyValuePair<K, V>> self,
public static DStream<Tuple<K, V>> ReduceByKeyAndWindow<K, V>(this DStream<Tuple<K, V>> self,
Func<V, V, V> reduceFunc,
Func<V, V, V> invReduceFunc,
int windowSeconds,
int slideSeconds = 0,
int numPartitions = 0,
Func<KeyValuePair<K, V>, bool> filterFunc = null)
Func<Tuple<K, V>, bool> filterFunc = null)
{
self.ValidateWindowParam(windowSeconds, slideSeconds);
@ -294,7 +294,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
formatter.Serialize(invStream, invReduceF);
|
||||
}
|
||||
|
||||
return new DStream<KeyValuePair<K, V>>(
|
||||
return new DStream<Tuple<K, V>>(
|
||||
SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpReducedWindowedDStream(
|
||||
reduced.DStreamProxy,
|
||||
stream.ToArray(),
|
||||
|
@ -321,8 +321,8 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="initialState">Initial state value of each key</param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
|
||||
Func<IEnumerable<V>, S, S> updateFunc, RDD<KeyValuePair<K, S>> initialState = null,
|
||||
public static DStream<Tuple<K, S>> UpdateStateByKey<K, V, S>(this DStream<Tuple<K, V>> self,
|
||||
Func<IEnumerable<V>, S, S> updateFunc, RDD<Tuple<K, S>> initialState = null,
|
||||
int numPartitions = 0)
|
||||
{
|
||||
return UpdateStateByKey<K, V, S>(self, new UpdateStateByKeyHelper<K, V, S>(updateFunc).Execute, initialState, numPartitions);
|
||||
|
@ -340,11 +340,11 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="initialState">Initial state value of each key</param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
|
||||
Func<IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> updateFunc, RDD<KeyValuePair<K, S>> initialState = null,
|
||||
public static DStream<Tuple<K, S>> UpdateStateByKey<K, V, S>(this DStream<Tuple<K, V>> self,
|
||||
Func<IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<Tuple<K, S>>> updateFunc, RDD<Tuple<K, S>> initialState = null,
|
||||
int numPartitions = 0)
|
||||
{
|
||||
return UpdateStateByKey<K, V, S>(self, new MapPartitionsHelper<KeyValuePair<K, Tuple<IEnumerable<V>, S>>, KeyValuePair<K, S>>(updateFunc).Execute, initialState, numPartitions);
|
||||
return UpdateStateByKey<K, V, S>(self, new MapPartitionsHelper<Tuple<K, Tuple<IEnumerable<V>, S>>, Tuple<K, S>>(updateFunc).Execute, initialState, numPartitions);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -359,9 +359,9 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <param name="initialState">Initial state value of each key</param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
|
||||
Func<int, IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> updateFunc,
|
||||
RDD<KeyValuePair<K, S>> initialState = null, int numPartitions = 0)
|
||||
public static DStream<Tuple<K, S>> UpdateStateByKey<K, V, S>(this DStream<Tuple<K, V>> self,
|
||||
Func<int, IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<Tuple<K, S>>> updateFunc,
|
||||
RDD<Tuple<K, S>> initialState = null, int numPartitions = 0)
|
||||
{
|
||||
if (numPartitions <= 0)
|
||||
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
|
||||
|
@ -377,7 +377,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
var stream = new MemoryStream();
|
||||
formatter.Serialize(stream, func);
|
||||
|
||||
return new DStream<KeyValuePair<K, S>>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
|
||||
return new DStream<Tuple<K, S>>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
|
||||
ds.DStreamProxy,
|
||||
stream.ToArray(),
|
||||
"CSharpStateDStream",
|
||||
|
@ -390,14 +390,14 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// Return a new "state" DStream where the state for each key is updated by applying
|
||||
/// the given function on the previous state of the key and the new values of the key.
|
||||
/// </summary>
|
||||
public static MapWithStateDStream<K, V, S, M> MapWithState<K, V, S, M>(this DStream<KeyValuePair<K, V>> self, StateSpec<K, V, S, M> stateSpec)
|
||||
public static MapWithStateDStream<K, V, S, M> MapWithState<K, V, S, M>(this DStream<Tuple<K, V>> self, StateSpec<K, V, S, M> stateSpec)
|
||||
{
|
||||
if (stateSpec.numPartitions <= 0)
|
||||
{
|
||||
stateSpec = stateSpec.NumPartitions(self.streamingContext.SparkContext.DefaultParallelism);
|
||||
}
|
||||
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = self.Piplinable ? (self as TransformedDStream<KeyValuePair<K, V>>).func : null;
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = self.Piplinable ? (self as TransformedDStream<Tuple<K, V>>).func : null;
|
||||
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new MapWithStateHelper<K, V, S, M>(prevFunc, stateSpec).Execute;
|
||||
|
||||
|
@ -414,8 +414,8 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
self.streamingContext);
|
||||
|
||||
DStream<M> mappedDataDStream = mapWithStateDStream.FlatMap(r => r.mappedData);
|
||||
DStream<KeyValuePair<K, S>> snapshotsDStream = mapWithStateDStream.FlatMap(
|
||||
r => r.stateMap.Select(entry => new KeyValuePair<K, S>(entry.Key, entry.Value.state)));
|
||||
DStream<Tuple<K, S>> snapshotsDStream = mapWithStateDStream.FlatMap(
|
||||
r => r.stateMap.Select(entry => new Tuple<K, S>(entry.Key, entry.Value.state)));
|
||||
|
||||
return new MapWithStateDStream<K, V, S, M>(mappedDataDStream, snapshotsDStream);
|
||||
}
|
||||
|
@ -443,7 +443,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
internal RDD<KeyValuePair<K, C>> Execute(RDD<KeyValuePair<K, V>> rdd)
|
||||
internal RDD<Tuple<K, C>> Execute(RDD<Tuple<K, V>> rdd)
|
||||
{
|
||||
return rdd.CombineByKey(createCombiner, mergeValue, mergeCombiners, numPartitions);
|
||||
}
|
||||
|
@ -458,7 +458,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
internal RDD<KeyValuePair<K, V>> Execute(RDD<KeyValuePair<K, V>> rdd)
|
||||
internal RDD<Tuple<K, V>> Execute(RDD<Tuple<K, V>> rdd)
|
||||
{
|
||||
return rdd.PartitionBy(numPartitions);
|
||||
}
|
||||
|
@ -473,7 +473,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
internal RDD<byte[]> Execute(RDD<KeyValuePair<K, V>> rdd)
|
||||
internal RDD<byte[]> Execute(RDD<Tuple<K, V>> rdd)
|
||||
{
|
||||
var keyed = rdd.MapPartitionsWithIndex(new PairRDDFunctions.AddShuffleKeyHelper<K, V>(numPartitions).Execute, true);
|
||||
keyed.bypassSerializer = true;
|
||||
|
@ -492,9 +492,9 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
func = f;
|
||||
}
|
||||
|
||||
internal KeyValuePair<K, U> Execute(KeyValuePair<K, V> kvp)
|
||||
internal Tuple<K, U> Execute(Tuple<K, V> kvp)
|
||||
{
|
||||
return new KeyValuePair<K, U>(kvp.Key, func(kvp.Value));
|
||||
return new Tuple<K, U>(kvp.Item1, func(kvp.Item2));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -507,9 +507,9 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
func = f;
|
||||
}
|
||||
|
||||
internal IEnumerable<KeyValuePair<K, U>> Execute(KeyValuePair<K, V> kvp)
|
||||
internal IEnumerable<Tuple<K, U>> Execute(Tuple<K, V> kvp)
|
||||
{
|
||||
return func(kvp.Value).Select(v => new KeyValuePair<K, U>(kvp.Key, v));
|
||||
return func(kvp.Item2).Select(v => new Tuple<K, U>(kvp.Item1, v));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -522,7 +522,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
internal RDD<KeyValuePair<K, List<V>>> Execute(RDD<KeyValuePair<K, V>> rdd)
|
||||
internal RDD<Tuple<K, List<V>>> Execute(RDD<Tuple<K, V>> rdd)
|
||||
{
|
||||
return rdd.GroupByKey(numPartitions);
|
||||
}
|
||||
|
@ -537,7 +537,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
internal RDD<KeyValuePair<K, Tuple<List<V>, List<W>>>> Execute(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
|
||||
internal RDD<Tuple<K, Tuple<List<V>, List<W>>>> Execute(RDD<Tuple<K, V>> l, RDD<Tuple<K, W>> r)
|
||||
{
|
||||
return l.GroupWith<K, V, W>(r, numPartitions);
|
||||
}
|
||||
|
@ -552,7 +552,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
internal RDD<KeyValuePair<K, Tuple<V, W>>> Execute(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
|
||||
internal RDD<Tuple<K, Tuple<V, W>>> Execute(RDD<Tuple<K, V>> l, RDD<Tuple<K, W>> r)
|
||||
{
|
||||
return l.Join<K, V, W>(r, numPartitions);
|
||||
}
|
||||
|
@ -567,7 +567,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
internal RDD<KeyValuePair<K, Tuple<V, Option<W>>>> Execute(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
|
||||
internal RDD<Tuple<K, Tuple<V, Option<W>>>> Execute(RDD<Tuple<K, V>> l, RDD<Tuple<K, W>> r)
|
||||
{
|
||||
return l.LeftOuterJoin<K, V, W>(r, numPartitions);
|
||||
}
|
||||
|
@ -582,7 +582,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
internal RDD<KeyValuePair<K, Tuple<Option<V>, W>>> Execute(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
|
||||
internal RDD<Tuple<K, Tuple<Option<V>, W>>> Execute(RDD<Tuple<K, V>> l, RDD<Tuple<K, W>> r)
|
||||
{
|
||||
return l.RightOuterJoin<K, V, W>(r, numPartitions);
|
||||
}
|
||||
|
@ -597,7 +597,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
internal RDD<KeyValuePair<K, Tuple<Option<V>, Option<W>>>> Execute(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
|
||||
internal RDD<Tuple<K, Tuple<Option<V>, Option<W>>>> Execute(RDD<Tuple<K, V>> l, RDD<Tuple<K, W>> r)
|
||||
{
|
||||
return l.FullOuterJoin<K, V, W>(r, numPartitions);
|
||||
}
|
||||
|
@ -609,12 +609,12 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
private readonly Func<V, V, V> reduceFunc;
|
||||
private readonly Func<V, V, V> invReduceFunc;
|
||||
private readonly int numPartitions;
|
||||
private readonly Func<KeyValuePair<K, V>, bool> filterFunc;
|
||||
private readonly Func<Tuple<K, V>, bool> filterFunc;
|
||||
|
||||
internal ReduceByKeyAndWindowHelper(Func<V, V, V> reduceF,
|
||||
Func<V, V, V> invReduceF,
|
||||
int numPartitions,
|
||||
Func<KeyValuePair<K, V>, bool> filterF)
|
||||
Func<Tuple<K, V>, bool> filterF)
|
||||
{
|
||||
reduceFunc = reduceF;
|
||||
invReduceFunc = invReduceF;
|
||||
|
@ -625,11 +625,11 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
internal RDD<dynamic> Reduce(double t, RDD<dynamic> a, RDD<dynamic> b)
|
||||
{
|
||||
b.partitioner = new Partitioner(numPartitions, null);
|
||||
var r = b.ConvertTo<KeyValuePair<K, V>>();
|
||||
var r = b.ConvertTo<Tuple<K, V>>();
|
||||
if (a != null)
|
||||
{
|
||||
a.partitioner = b.partitioner;
|
||||
r = a.ConvertTo<KeyValuePair<K, V>>().Union(r);
|
||||
r = a.ConvertTo<Tuple<K, V>>().Union(r);
|
||||
}
|
||||
r = r.ReduceByKey<K, V>(reduceFunc, numPartitions);
|
||||
if (filterFunc != null)
|
||||
|
@ -640,8 +640,8 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
internal RDD<dynamic> InvReduce(double t, RDD<dynamic> a, RDD<dynamic> b)
|
||||
{
|
||||
a.partitioner = b.partitioner = new Partitioner(numPartitions, null);
|
||||
var rddb = b.ConvertTo<KeyValuePair<K, V>>().ReduceByKey<K, V>(reduceFunc, numPartitions);
|
||||
var rdda = a.ConvertTo<KeyValuePair<K, V>>();
|
||||
var rddb = b.ConvertTo<Tuple<K, V>>().ReduceByKey<K, V>(reduceFunc, numPartitions);
|
||||
var rdda = a.ConvertTo<Tuple<K, V>>();
|
||||
var joined = rdda.Join<K, V, V>(rddb, numPartitions);
|
||||
var r = joined.MapValues<K, Tuple<V, V>, V>(kv => kv.Item2 != null ? invReduceFunc(kv.Item1, kv.Item2) : kv.Item1);
|
||||
return r.ConvertTo<dynamic>();
|
||||
|
@ -658,21 +658,21 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
func = f;
|
||||
}
|
||||
|
||||
internal IEnumerable<KeyValuePair<K, S>> Execute(IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>> input)
|
||||
internal IEnumerable<Tuple<K, S>> Execute(IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>> input)
|
||||
{
|
||||
return input.Select(x => new KeyValuePair<K, S>(x.Key, func(x.Value.Item1, x.Value.Item2)));
|
||||
return input.Select(x => new Tuple<K, S>(x.Item1, func(x.Item2.Item1, x.Item2.Item2)));
|
||||
}
|
||||
}
|
||||
|
||||
[Serializable]
|
||||
internal class UpdateStateByKeysHelper<K, V, S>
|
||||
{
|
||||
private readonly Func<int, IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> func;
|
||||
private readonly RDD<KeyValuePair<K, S>> initialState;
|
||||
private readonly Func<int, IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<Tuple<K, S>>> func;
|
||||
private readonly RDD<Tuple<K, S>> initialState;
|
||||
private readonly int numPartitions;
|
||||
internal UpdateStateByKeysHelper(
|
||||
Func<int, IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> f,
|
||||
RDD<KeyValuePair<K, S>> initialState, int numPartitions)
|
||||
Func<int, IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<Tuple<K, S>>> f,
|
||||
RDD<Tuple<K, S>> initialState, int numPartitions)
|
||||
{
|
||||
func = f;
|
||||
this.initialState = initialState;
|
||||
|
@ -681,11 +681,11 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
|
||||
internal RDD<dynamic> Execute(double t, RDD<dynamic> stateRDD, RDD<dynamic> valuesRDD)
|
||||
{
|
||||
RDD<KeyValuePair<K, S>> state = null;
|
||||
RDD<KeyValuePair<K, Tuple<IEnumerable<V>, S>>> g = null;
|
||||
RDD<Tuple<K, S>> state = null;
|
||||
RDD<Tuple<K, Tuple<IEnumerable<V>, S>>> g = null;
|
||||
|
||||
// call into scala side partitionBy directly since AddShuffleKey already applied
|
||||
var values = new RDD<KeyValuePair<K, V>>(valuesRDD.sparkContext.SparkContextProxy.CreatePairwiseRDD(valuesRDD.rddProxy, numPartitions, 0), valuesRDD.sparkContext);
|
||||
var values = new RDD<Tuple<K, V>>(valuesRDD.sparkContext.SparkContextProxy.CreatePairwiseRDD(valuesRDD.rddProxy, numPartitions, 0), valuesRDD.sparkContext);
|
||||
values.partitioner = new Partitioner(numPartitions, null);
|
||||
|
||||
if (stateRDD == null)
|
||||
|
@ -706,12 +706,12 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
}
|
||||
else
|
||||
{
|
||||
state = stateRDD.ConvertTo<KeyValuePair<K, S>>();
|
||||
state = stateRDD.ConvertTo<Tuple<K, S>>();
|
||||
state.partitioner = values.partitioner;
|
||||
g = state.GroupWith(values, numPartitions).MapValues(x => new Tuple<IEnumerable<V>, S>(new List<V>(x.Item2), x.Item1.Count > 0 ? x.Item1[0] : default(S)));
|
||||
}
|
||||
|
||||
state = g.MapPartitionsWithIndex((pid, iter) => func(pid, iter), true).Filter(x => x.Value != null);
|
||||
state = g.MapPartitionsWithIndex((pid, iter) => func(pid, iter), true).Filter(x => x.Item2 != null);
|
||||
|
||||
return state.ConvertTo<dynamic>();
|
||||
}
|
||||
|
|
|
@ -451,9 +451,9 @@
a function to sort the key.
</summary>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.SortByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Boolean,System.Nullable{System.Int32})">
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.SortByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Boolean,System.Nullable{System.Int32})">
<summary>
Sorts this RDD, which is assumed to consist of KeyValuePair pairs.
Sorts this RDD, which is assumed to consist of Tuple pairs.
</summary>
<typeparam name="K"></typeparam>
<typeparam name="V"></typeparam>
@ -462,9 +462,9 @@
<param name="numPartitions"></param>
<returns></returns>
</member>
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.SortByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Boolean,System.Nullable{System.Int32},System.Func{``0,``2})">
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.SortByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Boolean,System.Nullable{System.Int32},System.Func{``0,``2})">
<summary>
Sorts this RDD, which is assumed to consist of KeyValuePairs. If key is type of string, case is sensitive.
Sorts this RDD, which is assumed to consist of Tuples. If Item1 is type of string, case is sensitive.
</summary>
<typeparam name="K"></typeparam>
<typeparam name="V"></typeparam>
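A short sketch of the SortByKey overloads documented above, against the Tuple-based RDD type; the SparkContext `sc` and the sample data are assumptions for illustration:

```c#
var pairs = sc.Parallelize(new[]
{
    new Tuple<string, int>("banana", 2),
    new Tuple<string, int>("Apple", 1),
    new Tuple<string, int>("cherry", 3)
}, 2);

// Two-type-parameter overload: sorts by Item1; string keys compare case-sensitively.
var sorted = pairs.SortByKey(true, 1);

// Three-type-parameter overload: keyFunc projects the key before comparison,
// here to obtain a case-insensitive ordering.
var caseInsensitive = pairs.SortByKey(true, 1, k => k.ToLowerInvariant());

foreach (var kv in caseInsensitive.Collect())
    Console.WriteLine("{0} -> {1}", kv.Item1, kv.Item2);
```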
@ -472,10 +472,10 @@
|
|||
<param name="self"></param>
|
||||
<param name="ascending"></param>
|
||||
<param name="numPartitions">Number of partitions. Each partition of the sorted RDD contains a sorted range of the elements.</param>
|
||||
<param name="keyFunc">RDD will sort by keyFunc(key) for every key in KeyValuePair. Must not be null.</param>
|
||||
<param name="keyFunc">RDD will sort by keyFunc(key) for every Item1 in Tuple. Must not be null.</param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.repartitionAndSortWithinPartitions``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Nullable{System.Int32},System.Func{``0,System.Int32},System.Boolean)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.OrderedRDDFunctions.repartitionAndSortWithinPartitions``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Nullable{System.Int32},System.Func{``0,System.Int32},System.Boolean)">
|
||||
<summary>
|
||||
Repartition the RDD according to the given partitioner and, within each resulting partition,
|
||||
sort records by their keys.
|
||||
|
@ -493,16 +493,16 @@
|
|||
</member>
|
||||
<member name="T:Microsoft.Spark.CSharp.Core.PairRDDFunctions">
|
||||
<summary>
|
||||
operations only available to KeyValuePair RDD
|
||||
operations only available to Tuple RDD
|
||||
|
||||
See also http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions
|
||||
</summary>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.CollectAsMap``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.CollectAsMap``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}})">
|
||||
<summary>
|
||||
Return the key-value pairs in this RDD to the master as a dictionary.
|
||||
|
||||
var m = sc.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).CollectAsMap()
|
||||
var m = sc.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1).CollectAsMap()
|
||||
m[1]
|
||||
2
|
||||
m[3]
|
||||
|
@ -514,11 +514,11 @@
|
|||
<param name="self"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Keys``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Keys``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}})">
|
||||
<summary>
|
||||
Return an RDD with the keys of each tuple.
|
||||
|
||||
>>> m = sc.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).Keys().Collect()
|
||||
>>> m = sc.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1).Keys().Collect()
|
||||
[1, 3]
|
||||
</summary>
|
||||
<typeparam name="K"></typeparam>
|
||||
|
@ -526,11 +526,11 @@
|
|||
<param name="self"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Values``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Values``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}})">
|
||||
<summary>
|
||||
Return an RDD with the values of each tuple.
|
||||
|
||||
>>> m = sc.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).Values().Collect()
|
||||
>>> m = sc.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1).Values().Collect()
|
||||
[2, 4]
|
||||
|
||||
</summary>
|
||||
|
@ -539,7 +539,7 @@
|
|||
<param name="self"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.ReduceByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,``1,``1},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.ReduceByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``1,``1,``1},System.Int32)">
|
||||
<summary>
|
||||
Merge the values for each key using an associative reduce function.
|
||||
|
||||
|
@ -551,9 +551,9 @@
|
|||
|
||||
sc.Parallelize(new[]
|
||||
{
|
||||
new KeyValuePair<string, int>("a", 1),
|
||||
new KeyValuePair<string, int>("b", 1),
|
||||
new KeyValuePair<string, int>("a", 1)
|
||||
new Tuple<string, int>("a", 1),
|
||||
new Tuple<string, int>("b", 1),
|
||||
new Tuple<string, int>("a", 1)
|
||||
}, 2)
|
||||
.ReduceByKey((x, y) => x + y).Collect()
|
||||
|
||||
|
@ -567,7 +567,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.ReduceByKeyLocally``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,``1,``1})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.ReduceByKeyLocally``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``1,``1,``1})">
|
||||
<summary>
|
||||
Merge the values for each key using an associative reduce function, but
|
||||
return the results immediately to the master as a dictionary.
|
||||
|
@ -577,9 +577,9 @@
|
|||
|
||||
sc.Parallelize(new[]
|
||||
{
|
||||
new KeyValuePair<string, int>("a", 1),
|
||||
new KeyValuePair<string, int>("b", 1),
|
||||
new KeyValuePair<string, int>("a", 1)
|
||||
new Tuple<string, int>("a", 1),
|
||||
new Tuple<string, int>("b", 1),
|
||||
new Tuple<string, int>("a", 1)
|
||||
}, 2)
|
||||
.ReduceByKeyLocally((x, y) => x + y).Collect()
|
||||
|
||||
|
@ -592,15 +592,15 @@
|
|||
<param name="reduceFunc"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.CountByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.CountByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}})">
|
||||
<summary>
|
||||
Count the number of elements for each key, and return the result to the master as a dictionary.
|
||||
|
||||
sc.Parallelize(new[]
|
||||
{
|
||||
new KeyValuePair<string, int>("a", 1),
|
||||
new KeyValuePair<string, int>("b", 1),
|
||||
new KeyValuePair<string, int>("a", 1)
|
||||
new Tuple<string, int>("a", 1),
|
||||
new Tuple<string, int>("b", 1),
|
||||
new Tuple<string, int>("a", 1)
|
||||
}, 2)
|
||||
.CountByKey((x, y) => x + y).Collect()
|
||||
|
||||
|
@ -612,7 +612,7 @@
|
|||
<param name="self"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Join``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Join``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Return an RDD containing all pairs of elements with matching keys in this RDD and <paramref name="other"/>.
|
||||
|
||||
|
@ -621,9 +621,9 @@
|
|||
Performs a hash join across the cluster.
|
||||
|
||||
var l = sc.Parallelize(
|
||||
new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 1);
|
||||
new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 1);
|
||||
var r = sc.Parallelize(
|
||||
new[] { new KeyValuePair<string, int>("a", 2), new KeyValuePair<string, int>("a", 3) }, 1);
|
||||
new[] { new Tuple<string, int>("a", 2), new Tuple<string, int>("a", 3) }, 1);
|
||||
var m = l.Join(r, 2).Collect();
|
||||
|
||||
[('a', (1, 2)), ('a', (1, 3))]
|
||||
|
@ -637,7 +637,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.LeftOuterJoin``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.LeftOuterJoin``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Perform a left outer join of this RDD and <paramref name="other"/>.
|
||||
|
||||
|
@ -648,9 +648,9 @@
|
|||
Hash-partitions the resulting RDD into the given number of partitions.
|
||||
|
||||
var l = sc.Parallelize(
|
||||
new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 1);
|
||||
new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 1);
|
||||
var r = sc.Parallelize(
|
||||
new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
new[] { new Tuple<string, int>("a", 2) }, 1);
|
||||
var m = l.LeftOuterJoin(r).Collect();
|
||||
|
||||
[('a', (1, 2)), ('b', (4, Option))]
|
||||
|
@ -664,7 +664,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.RightOuterJoin``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.RightOuterJoin``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Perform a right outer join of this RDD and <paramref name="other"/>.
|
||||
|
||||
|
@ -675,9 +675,9 @@
|
|||
Hash-partitions the resulting RDD into the given number of partitions.
|
||||
|
||||
var l = sc.Parallelize(
|
||||
new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
new[] { new Tuple<string, int>("a", 2) }, 1);
|
||||
var r = sc.Parallelize(
|
||||
new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 1);
|
||||
new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 1);
|
||||
var m = l.RightOuterJoin(r).Collect();
|
||||
|
||||
[('a', (2, 1)), ('b', (Option, 4))]
|
||||
|
@ -691,7 +691,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.FullOuterJoin``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.FullOuterJoin``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Perform a full outer join of this RDD and <paramref name="other"/>.
|
||||
|
||||
|
@ -706,9 +706,9 @@
|
|||
Hash-partitions the resulting RDD into the given number of partitions.
|
||||
|
||||
var l = sc.Parallelize(
|
||||
new[] { new KeyValuePair<string, int>("a", 1), KeyValuePair<string, int>("b", 4) }, 1);
|
||||
new[] { new Tuple<string, int>("a", 1), Tuple<string, int>("b", 4) }, 1);
|
||||
var r = sc.Parallelize(
|
||||
new[] { new KeyValuePair<string, int>("a", 2), new KeyValuePair<string, int>("c", 8) }, 1);
|
||||
new[] { new Tuple<string, int>("a", 2), new Tuple<string, int>("c", 8) }, 1);
|
||||
var m = l.FullOuterJoin(r).Collect();
|
||||
|
||||
[('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]
|
||||
|
@ -722,18 +722,18 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.PartitionBy``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Int32,System.Func{System.Object,System.Int32})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.PartitionBy``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Int32,System.Func{System.Object,System.Int32})">
|
||||
<summary>
|
||||
Return a copy of the RDD partitioned using the specified partitioner.
|
||||
|
||||
sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1).Map(x => new KeyValuePair<int, int>(x, x)).PartitionBy(3).Glom().Collect()
|
||||
sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1).Map(x => new Tuple<int, int>(x, x)).PartitionBy(3).Glom().Collect()
|
||||
</summary>
|
||||
<param name="self"></param>
|
||||
<param name="numPartitions"></param>
|
||||
<param name="partitionFunc"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.CombineByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``2},System.Func{``2,``1,``2},System.Func{``2,``2,``2},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.CombineByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``2},System.Func{``2,``1,``2},System.Func{``2,``2,``2},System.Int32)">
|
||||
<summary>
|
||||
# TODO: add control over map-side aggregation
|
||||
Generic function to combine the elements for each key using a custom
|
||||
|
@ -755,9 +755,9 @@
|
|||
sc.Parallelize(
|
||||
new[]
|
||||
{
|
||||
new KeyValuePair<string, int>("a", 1),
|
||||
new KeyValuePair<string, int>("b", 1),
|
||||
new KeyValuePair<string, int>("a", 1)
|
||||
new Tuple<string, int>("a", 1),
|
||||
new Tuple<string, int>("b", 1),
|
||||
new Tuple<string, int>("a", 1)
|
||||
}, 2)
|
||||
.CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
|
||||
|
||||
|
@ -773,7 +773,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.AggregateByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``2},System.Func{``2,``1,``2},System.Func{``2,``2,``2},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.AggregateByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``2},System.Func{``2,``1,``2},System.Func{``2,``2,``2},System.Int32)">
|
||||
<summary>
|
||||
Aggregate the values of each key, using given combine functions and a neutral
|
||||
"zero value". This function can return a different result type, U, than the type
|
||||
|
@ -786,9 +786,9 @@
|
|||
sc.Parallelize(
|
||||
new[]
|
||||
{
|
||||
new KeyValuePair<string, int>("a", 1),
|
||||
new KeyValuePair<string, int>("b", 1),
|
||||
new KeyValuePair<string, int>("a", 1)
|
||||
new Tuple<string, int>("a", 1),
|
||||
new Tuple<string, int>("b", 1),
|
||||
new Tuple<string, int>("a", 1)
|
||||
}, 2)
|
||||
.CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
|
||||
|
||||
|
@ -804,7 +804,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.FoldByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1},System.Func{``1,``1,``1},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.FoldByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``1},System.Func{``1,``1,``1},System.Int32)">
|
||||
<summary>
|
||||
Merge the values for each key using an associative function "func"
|
||||
and a neutral "zeroValue" which may be added to the result an
|
||||
|
@ -814,9 +814,9 @@
|
|||
sc.Parallelize(
|
||||
new[]
|
||||
{
|
||||
new KeyValuePair<string, int>("a", 1),
|
||||
new KeyValuePair<string, int>("b", 1),
|
||||
new KeyValuePair<string, int>("a", 1)
|
||||
new Tuple<string, int>("a", 1),
|
||||
new Tuple<string, int>("b", 1),
|
||||
new Tuple<string, int>("a", 1)
|
||||
}, 2)
|
||||
.CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
|
||||
|
||||
|
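As above, the inline example shows CombineByKey rather than FoldByKey; a hedged FoldByKey sketch matching the member signature (zero-value factory plus an associative merge function), again on an assumed RDD<Tuple<string, int>> named `pairs`:

```c#
// Sketch: FoldByKey with a zero value of 0 and addition as the associative function.
var folded = pairs.FoldByKey(() => 0, (x, y) => x + y).Collect();   // e.g. [("a", 2), ("b", 1)]
```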
@ -830,7 +830,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupByKey``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Int32)">
|
||||
<summary>
|
||||
Group the values for each key in the RDD into a single sequence.
|
||||
Hash-partitions the resulting RDD with numPartitions partitions.
|
||||
|
@ -842,9 +842,9 @@
|
|||
sc.Parallelize(
|
||||
new[]
|
||||
{
|
||||
new KeyValuePair<string, int>("a", 1),
|
||||
new KeyValuePair<string, int>("b", 1),
|
||||
new KeyValuePair<string, int>("a", 1)
|
||||
new Tuple<string, int>("a", 1),
|
||||
new Tuple<string, int>("b", 1),
|
||||
new Tuple<string, int>("a", 1)
|
||||
}, 2)
|
||||
.GroupByKey().MapValues(l => string.Join(" ", l)).Collect()
|
||||
|
||||
|
@ -857,7 +857,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.MapValues``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,``2})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.MapValues``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``1,``2})">
|
||||
<summary>
|
||||
Pass each value in the key-value pair RDD through a map function
|
||||
without changing the keys; this also retains the original RDD's partitioning.
|
||||
|
@ -865,8 +865,8 @@
|
|||
sc.Parallelize(
|
||||
new[]
|
||||
{
|
||||
new KeyValuePair<string, string[]>("a", new[]{"apple", "banana", "lemon"}),
|
||||
new KeyValuePair<string, string[]>("b", new[]{"grapes"})
|
||||
new Tuple<string, string[]>("a", new[]{"apple", "banana", "lemon"}),
|
||||
new Tuple<string, string[]>("b", new[]{"grapes"})
|
||||
}, 2)
|
||||
.MapValues(x => x.Length).Collect()
|
||||
|
||||
|
@ -880,7 +880,7 @@
|
|||
<param name="func"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.FlatMapValues``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,System.Collections.Generic.IEnumerable{``2}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.FlatMapValues``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Func{``1,System.Collections.Generic.IEnumerable{``2}})">
|
||||
<summary>
|
||||
Pass each value in the key-value pair RDD through a flatMap function
|
||||
without changing the keys; this also retains the original RDD's partitioning.
|
||||
|
@ -888,8 +888,8 @@
|
|||
x = sc.Parallelize(
|
||||
new[]
|
||||
{
|
||||
new KeyValuePair<string, string[]>("a", new[]{"x", "y", "z"}),
|
||||
new KeyValuePair<string, string[]>("b", new[]{"p", "r"})
|
||||
new Tuple<string, string[]>("a", new[]{"x", "y", "z"}),
|
||||
new Tuple<string, string[]>("b", new[]{"p", "r"})
|
||||
}, 2)
|
||||
.FlatMapValues(x => x).Collect()
|
||||
|
||||
|
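A hedged side-by-side sketch of the two value-mapping operations documented above, assuming a live SparkContext `sc`; keys are left untouched and the input partitioning is retained in both cases:

```c#
var fruits = sc.Parallelize(
    new[]
    {
        new Tuple<string, string[]>("a", new[] { "apple", "banana", "lemon" }),
        new Tuple<string, string[]>("b", new[] { "grapes" })
    }, 2);

// MapValues: one output pair per input pair.
var lengths = fruits.MapValues(v => v.Length).Collect();   // [("a", 3), ("b", 1)]

// FlatMapValues: one output pair per element produced from the value.
var flattened = fruits.FlatMapValues(v => v).Collect();    // [("a", "apple"), ("a", "banana"), ...]
```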
@ -903,9 +903,9 @@
|
|||
<param name="func"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.MapPartitionsWithIndex``5(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,System.Object}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.MapPartitionsWithIndex``5(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,System.Object}})">
|
||||
<summary>
|
||||
explicitly convert KeyValuePair<K, V> to KeyValuePair<K, dynamic>
|
||||
explicitly convert Tuple<K, V> to Tuple<K, dynamic>
|
||||
since they are incompatible types, unlike V to dynamic
|
||||
</summary>
|
||||
<typeparam name="K"></typeparam>
|
||||
|
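A small sketch of the incompatibility the summary above refers to: a Tuple<K, V> instance is not assignable to Tuple<K, dynamic>, even though a V value converts to dynamic, so each pair has to be rebuilt (assumes a live SparkContext `sc`):

```c#
RDD<Tuple<string, int>> typed =
    sc.Parallelize(new[] { new Tuple<string, int>("a", 1) }, 1);

// Rebuilding each pair is the explicit conversion the member above performs internally.
RDD<Tuple<string, dynamic>> untyped =
    typed.Map(t => new Tuple<string, dynamic>(t.Item1, t.Item2));
```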
@ -916,13 +916,13 @@
|
|||
<param name="self"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupWith``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupWith``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
For each key k in this RDD or <paramref name="other"/>, return a resulting RDD that
|
||||
contains a tuple with the list of values for that key in this RDD as well as <paramref name="other"/>.
|
||||
|
||||
var x = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 2);
|
||||
var y = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
var x = sc.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 2);
|
||||
var y = sc.Parallelize(new[] { new Tuple<string, int>("a", 2) }, 1);
|
||||
x.GroupWith(y).Collect();
|
||||
|
||||
[('a', ([1], [2])), ('b', ([4], []))]
|
||||
|
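A hedged sketch of consuming the GroupWith result shown above; the value shape (a tuple holding one value list per side) is inferred from the printed sample, so treat the Item1/Item2 accesses as an assumption:

```c#
var x = sc.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 2);
var y = sc.Parallelize(new[] { new Tuple<string, int>("a", 2) }, 1);

foreach (var group in x.GroupWith(y).Collect())
{
    // group.Item1 is the key; group.Item2 holds the value lists from each input RDD.
    Console.WriteLine("{0}: [{1}] [{2}]",
        group.Item1,
        string.Join(" ", group.Item2.Item1),
        string.Join(" ", group.Item2.Item2));
}
```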
@ -936,11 +936,11 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupWith``4(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``3}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupWith``4(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``3}},System.Int32)">
|
||||
<summary>
|
||||
var x = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 5), new KeyValuePair<string, int>("b", 6) }, 2);
|
||||
var y = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 2);
|
||||
var z = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
var x = sc.Parallelize(new[] { new Tuple<string, int>("a", 5), new Tuple<string, int>("b", 6) }, 2);
|
||||
var y = sc.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 2);
|
||||
var z = sc.Parallelize(new[] { new Tuple<string, int>("a", 2) }, 1);
|
||||
x.GroupWith(y, z).Collect();
|
||||
</summary>
|
||||
<typeparam name="K"></typeparam>
|
||||
|
@ -953,12 +953,12 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupWith``5(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``3}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``4}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.GroupWith``5(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``3}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``4}},System.Int32)">
|
||||
<summary>
|
||||
var x = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 5), new KeyValuePair<string, int>("b", 6) }, 2);
|
||||
var y = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 2);
|
||||
var z = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
var w = sc.Parallelize(new[] { new KeyValuePair<string, int>("b", 42) }, 1);
|
||||
var x = sc.Parallelize(new[] { new Tuple<string, int>("a", 5), new Tuple<string, int>("b", 6) }, 2);
|
||||
var y = sc.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 2);
|
||||
var z = sc.Parallelize(new[] { new Tuple<string, int>("a", 2) }, 1);
|
||||
var w = sc.Parallelize(new[] { new Tuple<string, int>("b", 42) }, 1);
|
||||
var m = x.GroupWith(y, z, w).MapValues(l => string.Join(" ", l.Item1) + " : " + string.Join(" ", l.Item2) + " : " + string.Join(" ", l.Item3) + " : " + string.Join(" ", l.Item4)).Collect();
|
||||
</summary>
|
||||
<typeparam name="K"></typeparam>
|
||||
|
@ -973,12 +973,12 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SubtractByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SubtractByKey``3(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Return each (key, value) pair in this RDD that has no pair with matching key in <paramref name="other"/>.
|
||||
|
||||
var x = sc.Parallelize(new[] { new KeyValuePair<string, int?>("a", 1), new KeyValuePair<string, int?>("b", 4), new KeyValuePair<string, int?>("b", 5), new KeyValuePair<string, int?>("a", 2) }, 2);
|
||||
var y = sc.Parallelize(new[] { new KeyValuePair<string, int?>("a", 3), new KeyValuePair<string, int?>("c", null) }, 2);
|
||||
var x = sc.Parallelize(new[] { new Tuple<string, int?>("a", 1), new Tuple<string, int?>("b", 4), new Tuple<string, int?>("b", 5), new Tuple<string, int?>("a", 2) }, 2);
|
||||
var y = sc.Parallelize(new[] { new Tuple<string, int?>("a", 3), new Tuple<string, int?>("c", null) }, 2);
|
||||
x.SubtractByKey(y).Collect();
|
||||
|
||||
[('b', 4), ('b', 5)]
|
||||
|
@ -992,14 +992,14 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Lookup``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},``0)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.Lookup``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},``0)">
|
||||
<summary>
|
||||
Return the list of values in the RDD for key `key`. This operation
|
||||
is done efficiently if the RDD has a known partitioner by only
|
||||
searching the partition that the key maps to.
|
||||
|
||||
>>> l = range(1000)
|
||||
>>> rdd = sc.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new KeyValuePair<int, int>(x, y)), 10)
|
||||
>>> rdd = sc.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new Tuple<int, int>(x, y)), 10)
|
||||
>>> rdd.lookup(42)
|
||||
[42]
|
||||
|
||||
|
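The example above keeps the Python-style >>> prompts from the original docstring; a hedged C# form (method name capitalized per the member signature, and assuming System.Linq is imported for Enumerable):

```c#
var rdd = sc.Parallelize(
    Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new Tuple<int, int>(x, y)),
    10);

// Only the partition that key 42 maps to is scanned when the RDD has a known partitioner.
var values = rdd.Lookup(42);   // [42]
```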
@ -1010,7 +1010,7 @@
|
|||
<param name="key"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsNewAPIHadoopDataset``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsNewAPIHadoopDataset``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
|
||||
<summary>
|
||||
Output a Python RDD of key-value pairs (of form RDD[(K, V)]) to any Hadoop file
|
||||
system, using the new Hadoop OutputFormat API (mapreduce package). Keys/values are
|
||||
|
@ -1022,7 +1022,7 @@
|
|||
<param name="self"></param>
|
||||
<param name="conf">Hadoop job configuration, passed in as a dict</param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsNewAPIHadoopFile``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsNewAPIHadoopFile``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
|
||||
<summary>
|
||||
|
||||
</summary>
|
||||
|
@ -1035,7 +1035,7 @@
|
|||
<param name="valueClass">fully qualified classname of value Writable class (e.g. "org.apache.hadoop.io.Text", None by default)</param>
|
||||
<param name="conf">Hadoop job configuration, passed in as a dict (None by default)</param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsHadoopDataset``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsHadoopDataset``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
|
||||
<summary>
|
||||
Output a Python RDD of key-value pairs (of form RDD[(K, V)]) to any Hadoop file
|
||||
system, using the old Hadoop OutputFormat API (mapred package). Keys/values are
|
||||
|
@ -1047,7 +1047,7 @@
|
|||
<param name="self"></param>
|
||||
<param name="conf">Hadoop job configuration, passed in as a dict</param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsHadoopFile``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}},System.String)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsHadoopFile``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}},System.String)">
|
||||
<summary>
|
||||
Output a Python RDD of key-value pairs (of form RDD[(K, V)]) to any Hadoop file
|
||||
system, using the old Hadoop OutputFormat API (mapred package). Key and value types
|
||||
|
@ -1066,7 +1066,7 @@
|
|||
<param name="conf">(None by default)</param>
|
||||
<param name="compressionCodecClass">(None by default)</param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsSequenceFile``2(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``1}},System.String,System.String)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.PairRDDFunctions.SaveAsSequenceFile``2(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``1}},System.String,System.String)">
|
||||
<summary>
|
||||
Output a Python RDD of key-value pairs (of form RDD[(K, V)]) to any Hadoop file
|
||||
system, using the org.apache.hadoop.io.Writable types that we convert from the
|
||||
|
@ -1169,6 +1169,11 @@
|
|||
Indicates whether the RDD is checkpointed.
|
||||
</summary>
|
||||
</member>
|
||||
<member name="P:Microsoft.Spark.CSharp.Core.RDD`1.SparkContext">
|
||||
<summary>
|
||||
Return the SparkContext that created this RDD
|
||||
</summary>
|
||||
</member>
|
||||
<member name="P:Microsoft.Spark.CSharp.Core.RDD`1.IsCached">
|
||||
<summary>
|
||||
Return whether this RDD has been cached or not
|
||||
|
@ -1231,7 +1236,7 @@
|
|||
<summary>
|
||||
Return a new RDD by applying a function to each element of this RDD.
|
||||
|
||||
sc.Parallelize(new string[]{"b", "a", "c"}, 1).Map(x => new KeyValuePair<string, int>(x, 1)).Collect()
|
||||
sc.Parallelize(new string[]{"b", "a", "c"}, 1).Map(x => new Tuple<string, int>(x, 1)).Collect()
|
||||
[('a', 1), ('b', 1), ('c', 1)]
|
||||
|
||||
</summary>
|
||||
|
@ -2132,7 +2137,7 @@
|
|||
|
||||
Do
|
||||
{{{
|
||||
RDD<KeyValuePair<string, string>> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path")
|
||||
RDD<Tuple<string, string>> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path")
|
||||
}}}
|
||||
|
||||
then `rdd` contains
|
||||
|
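A hedged sketch of consuming the WholeTextFiles RDD described above, where each element pairs a file path with that file's content (assumes a reachable HDFS path and a live SparkContext named sparkContext):

```c#
RDD<Tuple<string, string>> files = sparkContext.WholeTextFiles("hdfs://a-hdfs-path");

foreach (var file in files.Collect())
{
    // Item1 = file path, Item2 = whole file content.
    Console.WriteLine("{0}: {1} characters", file.Item1, file.Item2.Length);
}
```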
@ -2167,7 +2172,7 @@
|
|||
}}}
|
||||
|
||||
Do
|
||||
RDD<KeyValuePair<string, byte[]>>"/> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
|
||||
RDD<Tuple<string, byte[]>> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
|
||||
|
||||
then `rdd` contains
|
||||
{{{
|
||||
|
@ -2206,7 +2211,7 @@
|
|||
<param name="minSplits">minimum splits in dataset (default min(2, sc.defaultParallelism))</param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.NewAPIHadoopFile(System.String,System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.NewAPIHadoopFile(System.String,System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
|
||||
<summary>
|
||||
Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,
|
||||
a local file system (available on all nodes), or any Hadoop-supported file system URI.
|
||||
|
@ -2224,7 +2229,7 @@
|
|||
<param name="conf"> Hadoop configuration, passed in as a dict (None by default)</param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.NewAPIHadoopRDD(System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.NewAPIHadoopRDD(System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
|
||||
<summary>
|
||||
Read a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
|
||||
Hadoop configuration, which is passed in as a Python dict.
|
||||
|
@ -2240,7 +2245,7 @@
|
|||
<param name="conf">Hadoop configuration, passed in as a dict (None by default)</param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.HadoopFile(System.String,System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.HadoopFile(System.String,System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
|
||||
<summary>
|
||||
Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS,
|
||||
a local file system (available on all nodes), or any Hadoop-supported file system URI.
|
||||
|
@ -2258,7 +2263,7 @@
|
|||
<param name="conf">Hadoop configuration, passed in as a dict (None by default)</param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.HadoopRDD(System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.String,System.String}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.HadoopRDD(System.String,System.String,System.String,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
|
||||
<summary>
|
||||
Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
|
||||
Hadoop configuration, which is passed in as a Python dict.
|
||||
|
@ -2391,6 +2396,14 @@
|
|||
</summary>
|
||||
<param name="logLevel"></param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.RunJob``1(Microsoft.Spark.CSharp.Core.RDD{``0},System.Collections.Generic.IEnumerable{System.Int32})">
|
||||
<summary>
|
||||
Run a job on a given set of partitions of an RDD.
|
||||
</summary>
|
||||
<typeparam name="T"></typeparam>
|
||||
<param name="rdd"></param>
|
||||
<param name="partitions"></param>
|
||||
</member>
|
||||
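A minimal sketch of the new RunJob member documented above, materializing only selected partitions (assumes a live SparkContext `sc` and System.Linq):

```c#
var rdd = sc.Parallelize(Enumerable.Range(0, 100), 4);

// Trigger computation of partitions 0 and 2 only.
sc.RunJob(rdd, new[] { 0, 2 });
```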
<member name="M:Microsoft.Spark.CSharp.Core.SparkContext.CancelJobGroup(System.String)">
|
||||
<summary>
|
||||
Cancel active jobs for the specified group. See <see cref="M:Microsoft.Spark.CSharp.Core.SparkContext.SetJobGroup(System.String,System.String,System.Boolean)"/> for more information.
|
||||
|
@ -7662,6 +7675,171 @@
|
|||
<param name="json">The Json object used to construct a StructType</param>
|
||||
<returns>A new StructType instance</returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``1(System.String,System.Func{``0})">
|
||||
<summary>
|
||||
Register UDF with no input argument, e.g:
|
||||
SqlContext.RegisterFunction<bool>("MyFilter", () => true);
|
||||
sqlContext.Sql("SELECT * FROM MyTable where MyFilter()");
|
||||
</summary>
|
||||
<typeparam name="RT"></typeparam>
|
||||
<param name="name"></param>
|
||||
<param name="f"></param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``2(System.String,System.Func{``1,``0})">
|
||||
<summary>
|
||||
Register UDF with 1 input argument, e.g:
|
||||
SqlContext.RegisterFunction<bool, string>("MyFilter", (arg1) => arg1 != null);
|
||||
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1)");
|
||||
</summary>
|
||||
<typeparam name="RT"></typeparam>
|
||||
<typeparam name="A1"></typeparam>
|
||||
<param name="name"></param>
|
||||
<param name="f"></param>
|
||||
</member>
|
||||
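A hedged end-to-end sketch of the one-argument registration shown above; the table and column names are made up, and `sqlContext` is assumed to be an existing SqlContext:

```c#
// Register a UDF that filters out null values, then use it from SQL.
sqlContext.RegisterFunction<bool, string>("MyFilter", arg1 => arg1 != null);

var filtered = sqlContext.Sql("SELECT * FROM MyTable WHERE MyFilter(columnName1)");
filtered.Show();   // DataFrame.Show is assumed here for display
```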
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``3(System.String,System.Func{``1,``2,``0})">
|
||||
<summary>
|
||||
Register UDF with 2 input arguments, e.g:
|
||||
SqlContext.RegisterFunction<bool, string, string>("MyFilter", (arg1, arg2) => arg1 != null && arg2 != null);
|
||||
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2)");
|
||||
</summary>
|
||||
<typeparam name="RT"></typeparam>
|
||||
<typeparam name="A1"></typeparam>
|
||||
<typeparam name="A2"></typeparam>
|
||||
<param name="name"></param>
|
||||
<param name="f"></param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``4(System.String,System.Func{``1,``2,``3,``0})">
|
||||
<summary>
|
||||
Register UDF with 3 input arguments, e.g:
|
||||
SqlContext.RegisterFunction<bool, string, string, string>("MyFilter", (arg1, arg2, arg3) => arg1 != null && arg2 != null && arg3 != null);
|
||||
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, columnName3)");
|
||||
</summary>
|
||||
<typeparam name="RT"></typeparam>
|
||||
<typeparam name="A1"></typeparam>
|
||||
<typeparam name="A2"></typeparam>
|
||||
<typeparam name="A3"></typeparam>
|
||||
<param name="name"></param>
|
||||
<param name="f"></param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``5(System.String,System.Func{``1,``2,``3,``4,``0})">
|
||||
<summary>
|
||||
Register UDF with 4 input arguments, e.g:
|
||||
SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg4) => arg1 != null && arg2 != null && ... && arg3 != null);
|
||||
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName4)");
|
||||
</summary>
|
||||
<typeparam name="RT"></typeparam>
|
||||
<typeparam name="A1"></typeparam>
|
||||
<typeparam name="A2"></typeparam>
|
||||
<typeparam name="A3"></typeparam>
|
||||
<typeparam name="A4"></typeparam>
|
||||
<param name="name"></param>
|
||||
<param name="f"></param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``6(System.String,System.Func{``1,``2,``3,``4,``5,``0})">
|
||||
<summary>
|
||||
Register UDF with 5 input arguments, e.g:
|
||||
SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg5) => arg1 != null && arg2 != null && ... && arg5 != null);
|
||||
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName5)");
|
||||
</summary>
|
||||
<typeparam name="RT"></typeparam>
|
||||
<typeparam name="A1"></typeparam>
|
||||
<typeparam name="A2"></typeparam>
|
||||
<typeparam name="A3"></typeparam>
|
||||
<typeparam name="A4"></typeparam>
|
||||
<typeparam name="A5"></typeparam>
|
||||
<param name="name"></param>
|
||||
<param name="f"></param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``7(System.String,System.Func{``1,``2,``3,``4,``5,``6,``0})">
|
||||
<summary>
|
||||
Register UDF with 6 input arguments, e.g:
|
||||
SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg6) => arg1 != null && arg2 != null && ... && arg6 != null);
|
||||
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName6)");
|
||||
</summary>
|
||||
<typeparam name="RT"></typeparam>
|
||||
<typeparam name="A1"></typeparam>
|
||||
<typeparam name="A2"></typeparam>
|
||||
<typeparam name="A3"></typeparam>
|
||||
<typeparam name="A4"></typeparam>
|
||||
<typeparam name="A5"></typeparam>
|
||||
<typeparam name="A6"></typeparam>
|
||||
<param name="name"></param>
|
||||
<param name="f"></param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``8(System.String,System.Func{``1,``2,``3,``4,``5,``6,``7,``0})">
|
||||
<summary>
|
||||
Register UDF with 7 input arguments, e.g:
|
||||
SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg7) => arg1 != null && arg2 != null && ... && arg7 != null);
|
||||
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName7)");
|
||||
</summary>
|
||||
<typeparam name="RT"></typeparam>
|
||||
<typeparam name="A1"></typeparam>
|
||||
<typeparam name="A2"></typeparam>
|
||||
<typeparam name="A3"></typeparam>
|
||||
<typeparam name="A4"></typeparam>
|
||||
<typeparam name="A5"></typeparam>
|
||||
<typeparam name="A6"></typeparam>
|
||||
<typeparam name="A7"></typeparam>
|
||||
<param name="name"></param>
|
||||
<param name="f"></param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``9(System.String,System.Func{``1,``2,``3,``4,``5,``6,``7,``8,``0})">
|
||||
<summary>
|
||||
Register UDF with 8 input arguments, e.g:
|
||||
SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg8) => arg1 != null && arg2 != null && ... && arg8 != null);
|
||||
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName8)");
|
||||
</summary>
|
||||
<typeparam name="RT"></typeparam>
|
||||
<typeparam name="A1"></typeparam>
|
||||
<typeparam name="A2"></typeparam>
|
||||
<typeparam name="A3"></typeparam>
|
||||
<typeparam name="A4"></typeparam>
|
||||
<typeparam name="A5"></typeparam>
|
||||
<typeparam name="A6"></typeparam>
|
||||
<typeparam name="A7"></typeparam>
|
||||
<typeparam name="A8"></typeparam>
|
||||
<param name="name"></param>
|
||||
<param name="f"></param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``10(System.String,System.Func{``1,``2,``3,``4,``5,``6,``7,``8,``9,``0})">
|
||||
<summary>
|
||||
Register UDF with 9 input arguments, e.g:
|
||||
SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg9) => arg1 != null && arg2 != null && ... && arg9 != null);
|
||||
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName9)");
|
||||
</summary>
|
||||
<typeparam name="RT"></typeparam>
|
||||
<typeparam name="A1"></typeparam>
|
||||
<typeparam name="A2"></typeparam>
|
||||
<typeparam name="A3"></typeparam>
|
||||
<typeparam name="A4"></typeparam>
|
||||
<typeparam name="A5"></typeparam>
|
||||
<typeparam name="A6"></typeparam>
|
||||
<typeparam name="A7"></typeparam>
|
||||
<typeparam name="A8"></typeparam>
|
||||
<typeparam name="A9"></typeparam>
|
||||
<param name="name"></param>
|
||||
<param name="f"></param>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Sql.UdfRegistration.RegisterFunction``11(System.String,System.Func{``1,``2,``3,``4,``5,``6,``7,``8,``9,``10,``0})">
|
||||
<summary>
|
||||
Register UDF with 10 input arguments, e.g:
|
||||
SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg10) => arg1 != null && arg2 != null && ... && arg10 != null);
|
||||
sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName10)");
|
||||
</summary>
|
||||
<typeparam name="RT"></typeparam>
|
||||
<typeparam name="A1"></typeparam>
|
||||
<typeparam name="A2"></typeparam>
|
||||
<typeparam name="A3"></typeparam>
|
||||
<typeparam name="A4"></typeparam>
|
||||
<typeparam name="A5"></typeparam>
|
||||
<typeparam name="A6"></typeparam>
|
||||
<typeparam name="A7"></typeparam>
|
||||
<typeparam name="A8"></typeparam>
|
||||
<typeparam name="A9"></typeparam>
|
||||
<typeparam name="A10"></typeparam>
|
||||
<param name="name"></param>
|
||||
<param name="f"></param>
|
||||
</member>
|
||||
<member name="T:Microsoft.Spark.CSharp.Streaming.ConstantInputDStream`1">
|
||||
<summary>
|
||||
An input stream that always returns the same RDD on each timestep. Useful for testing.
|
||||
|
@ -7968,7 +8146,7 @@
|
|||
Utility for creating streams from
|
||||
</summary>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.EventHubsUtils.CreateUnionStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.Dictionary{System.String,System.String},Microsoft.Spark.CSharp.Core.StorageLevelType)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.EventHubsUtils.CreateUnionStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}},Microsoft.Spark.CSharp.Core.StorageLevelType)">
|
||||
<summary>
|
||||
Create a unioned EventHubs stream that receives data from Microsoft Azure Eventhubs
|
||||
The unioned stream will receive message from all partitions of the EventHubs
|
||||
|
@ -7998,7 +8176,7 @@
|
|||
Utils for Kafka input stream.
|
||||
</summary>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.String,System.String,System.Collections.Generic.Dictionary{System.String,System.Int32},System.Collections.Generic.Dictionary{System.String,System.String})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.Int32}},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
|
||||
<summary>
|
||||
Create an input stream that pulls messages from a Kafka Broker.
|
||||
</summary>
|
||||
|
@ -8009,7 +8187,7 @@
|
|||
<param name="kafkaParams">Additional params for Kafka</param>
|
||||
<returns>A DStream object</returns>
|
||||
</member>
|
||||
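A hedged sketch of the Tuple-based CreateStream overload above; the parameter meaning (ZooKeeper quorum, consumer group, topic/thread pairs, Kafka params) is assumed to mirror Spark's KafkaUtils.createStream, and the host, group, and topic names are made up:

```c#
var kafkaStream = KafkaUtils.CreateStream(
    ssc,                                                     // an existing StreamingContext
    "zkhost:2181",                                           // ZooKeeper quorum (hypothetical)
    "mobius-consumer-group",                                 // consumer group id (hypothetical)
    new[] { new Tuple<string, int>("my-topic", 1) },         // topic -> consumer thread count (assumed)
    new[] { new Tuple<string, string>("auto.offset.reset", "smallest") });
```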
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.String,System.String,System.Collections.Generic.Dictionary{System.String,System.Int32},System.Collections.Generic.Dictionary{System.String,System.String},Microsoft.Spark.CSharp.Core.StorageLevelType)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.String,System.String,System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.Int32}},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}},Microsoft.Spark.CSharp.Core.StorageLevelType)">
|
||||
<summary>
|
||||
Create an input stream that pulls messages from a Kafka Broker.
|
||||
</summary>
|
||||
|
@ -8021,7 +8199,7 @@
|
|||
<param name="storageLevelType">RDD storage level.</param>
|
||||
<returns>A DStream object</returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateDirectStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.Dictionary{System.String,System.String},System.Collections.Generic.Dictionary{System.String,System.Int64})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateDirectStream(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.Int64}})">
|
||||
<summary>
|
||||
Create an input stream that directly pulls messages from a Kafka Broker and specific offset.
|
||||
|
||||
|
@ -8047,7 +8225,7 @@
|
|||
<param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
|
||||
<returns>A DStream object</returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateDirectStream``1(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.Dictionary{System.String,System.String},System.Collections.Generic.Dictionary{System.String,System.Int64},System.Func{System.Int32,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.Byte[],System.Byte[]}},System.Collections.Generic.IEnumerable{``0}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.CreateDirectStream``1(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.Int64}},System.Func{System.Int32,System.Collections.Generic.IEnumerable{System.Tuple{System.Byte[],System.Byte[]}},System.Collections.Generic.IEnumerable{``0}})">
|
||||
<summary>
|
||||
Create an input stream that directly pulls messages from a Kafka Broker and specific offset.
|
||||
|
||||
|
@ -8074,14 +8252,14 @@
|
|||
<param name="readFunc">user function to process the kafka data.</param>
|
||||
<returns>A DStream object</returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.GetOffsetRange(System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{System.Byte[],System.Byte[]}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.GetOffsetRange(System.Collections.Generic.IEnumerable{System.Tuple{System.Byte[],System.Byte[]}})">
|
||||
<summary>
|
||||
create offset range from kafka messages when CSharpReader is enabled
|
||||
</summary>
|
||||
<param name="input"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.GetNumPartitionsFromConfig(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.Dictionary{System.String,System.String})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.KafkaUtils.GetNumPartitionsFromConfig(Microsoft.Spark.CSharp.Streaming.StreamingContext,System.Collections.Generic.List{System.String},System.Collections.Generic.IEnumerable{System.Tuple{System.String,System.String}})">
|
||||
<summary>
|
||||
topics should contain only one topic if choosing to repartition to a configured numPartitions
|
||||
TODO: move to scala and merge into DynamicPartitionKafkaRDD.getPartitions to remove above limitation
|
||||
|
@ -8202,7 +8380,7 @@
|
|||
<param name="idleDuration">The idle time of duration</param>
|
||||
<returns>The new StateSpec object</returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.StateSpec`4.InitialState(Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{`0,`2}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.StateSpec`4.InitialState(Microsoft.Spark.CSharp.Core.RDD{System.Tuple{`0,`2}})">
|
||||
<summary>
|
||||
Set the RDD containing the initial states that will be used by mapWithState
|
||||
</summary>
|
||||
|
@ -8249,10 +8427,10 @@
|
|||
</member>
|
||||
<member name="T:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions">
|
||||
<summary>
|
||||
operations only available to KeyValuePair RDD
|
||||
operations only available to Tuple RDD
|
||||
</summary>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.ReduceByKey``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,``1,``1},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.ReduceByKey``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{``1,``1,``1},System.Int32)">
|
||||
<summary>
|
||||
Return a new DStream by applying ReduceByKey to each RDD.
|
||||
</summary>
|
||||
|
@ -8263,7 +8441,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
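A short sketch of the DStream-level ReduceByKey documented above, assuming `pairs` is a DStream<Tuple<string, int>> built by mapping words to (word, 1) as in the tests further down:

```c#
var counts = pairs.ReduceByKey((x, y) => x + y);

// Inspect each batch; the (time, rdd) ForeachRDD callback is used the same way in the tests below.
counts.ForeachRDD((time, rdd) => Console.WriteLine("{0}: {1} distinct words", time, rdd.Count()));
```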
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.CombineByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``2},System.Func{``2,``1,``2},System.Func{``2,``2,``2},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.CombineByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{``2},System.Func{``2,``1,``2},System.Func{``2,``2,``2},System.Int32)">
|
||||
<summary>
|
||||
Return a new DStream by applying combineByKey to each RDD.
|
||||
</summary>
|
||||
|
@ -8277,7 +8455,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.PartitionBy``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.PartitionBy``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Int32)">
|
||||
<summary>
|
||||
Return a new DStream in which each RDD are partitioned by numPartitions.
|
||||
</summary>
|
||||
|
@ -8287,7 +8465,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.MapValues``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,``2})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.MapValues``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{``1,``2})">
|
||||
<summary>
|
||||
Return a new DStream by applying a map function to the value of
|
||||
each key-value pairs in this DStream without changing the key.
|
||||
|
@ -8299,7 +8477,7 @@
|
|||
<param name="func"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.FlatMapValues``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,System.Collections.Generic.IEnumerable{``2}})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.FlatMapValues``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{``1,System.Collections.Generic.IEnumerable{``2}})">
|
||||
<summary>
|
||||
Return a new DStream by applying a flatmap function to the value
|
||||
of each key-value pairs in this DStream without changing the key.
|
||||
|
@ -8311,7 +8489,7 @@
|
|||
<param name="func"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.GroupByKey``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.GroupByKey``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Int32)">
|
||||
<summary>
|
||||
Return a new DStream by applying groupByKey on each RDD.
|
||||
</summary>
|
||||
|
@ -8321,7 +8499,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.GroupWith``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.GroupWith``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Return a new DStream by applying 'cogroup' between RDDs of this DStream and `other` DStream.
|
||||
Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
|
||||
|
@ -8334,7 +8512,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.Join``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.Join``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Return a new DStream by applying 'join' between RDDs of this DStream and `other` DStream.
|
||||
Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
|
||||
|
@ -8347,7 +8525,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.LeftOuterJoin``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.LeftOuterJoin``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Return a new DStream by applying 'left outer join' between RDDs of this DStream and `other` DStream.
|
||||
Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
|
||||
|
@ -8360,7 +8538,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.RightOuterJoin``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.RightOuterJoin``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Return a new DStream by applying 'right outer join' between RDDs of this DStream and `other` DStream.
|
||||
Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
|
||||
|
@ -8373,7 +8551,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.FullOuterJoin``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.FullOuterJoin``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Return a new DStream by applying 'full outer join' between RDDs of this DStream and `other` DStream.
|
||||
Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
|
||||
|
@ -8386,7 +8564,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.GroupByKeyAndWindow``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Int32,System.Int32,System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.GroupByKeyAndWindow``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Int32,System.Int32,System.Int32)">
|
||||
<summary>
|
||||
Return a new DStream by applying `GroupByKey` over a sliding window.
|
||||
Similar to `DStream.GroupByKey()`, but applies it over a sliding window.
|
||||
|
@ -8403,7 +8581,7 @@
|
|||
<param name="numPartitions">Number of partitions of each RDD in the new DStream.</param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.ReduceByKeyAndWindow``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{``1,``1,``1},System.Func{``1,``1,``1},System.Int32,System.Int32,System.Int32,System.Func{System.Collections.Generic.KeyValuePair{``0,``1},System.Boolean})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.ReduceByKeyAndWindow``2(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{``1,``1,``1},System.Func{``1,``1,``1},System.Int32,System.Int32,System.Int32,System.Func{System.Tuple{``0,``1},System.Boolean})">
|
||||
<summary>
|
||||
Return a new DStream by applying incremental `reduceByKey` over a sliding window.
|
||||
|
||||
|
@ -8424,7 +8602,7 @@
|
|||
<param name="filterFunc">function to filter expired key-value pairs; only pairs that satisfy the function are retained set this to null if you do not want to filter</param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.UpdateStateByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{System.Collections.Generic.IEnumerable{``1},``2,``2},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.UpdateStateByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{System.Collections.Generic.IEnumerable{``1},``2,``2},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Return a new "state" DStream where the state for each key is updated by applying
|
||||
the given function on the previous state of the key and the new values of the key.
|
||||
|
@ -8441,7 +8619,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
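A hedged running-count sketch of the first UpdateStateByKey overload above; `pairs` is assumed to be a DStream<Tuple<string, int>> on a checkpointed StreamingContext, new keys are assumed to start from default(int), and System.Linq is assumed for Sum:

```c#
var runningCounts = pairs.UpdateStateByKey<string, int, int>(
    (newValues, state) => state + newValues.Sum(),   // fold this batch's values into the previous state
    null,                                            // no initial-state RDD
    0);                                              // default partitioning
```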
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.UpdateStateByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{``0,System.Tuple{System.Collections.Generic.IEnumerable{``1},``2}}},System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{``0,``2}}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.UpdateStateByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{System.Collections.Generic.IEnumerable{System.Tuple{``0,System.Tuple{System.Collections.Generic.IEnumerable{``1},``2}}},System.Collections.Generic.IEnumerable{System.Tuple{``0,``2}}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Return a new "state" DStream where the state for each key is updated by applying
|
||||
the given function on the previous state of the key and the new values of the key.
|
||||
|
@ -8455,7 +8633,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.UpdateStateByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},System.Func{System.Int32,System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{``0,System.Tuple{System.Collections.Generic.IEnumerable{``1},``2}}},System.Collections.Generic.IEnumerable{System.Collections.Generic.KeyValuePair{``0,``2}}},Microsoft.Spark.CSharp.Core.RDD{System.Collections.Generic.KeyValuePair{``0,``2}},System.Int32)">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.UpdateStateByKey``3(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},System.Func{System.Int32,System.Collections.Generic.IEnumerable{System.Tuple{``0,System.Tuple{System.Collections.Generic.IEnumerable{``1},``2}}},System.Collections.Generic.IEnumerable{System.Tuple{``0,``2}}},Microsoft.Spark.CSharp.Core.RDD{System.Tuple{``0,``2}},System.Int32)">
|
||||
<summary>
|
||||
Return a new "state" DStream where the state for each key is updated by applying
|
||||
the given function on the previous state of the key and the new values of the key.
|
||||
|
@ -8469,7 +8647,7 @@
|
|||
<param name="numPartitions"></param>
|
||||
<returns></returns>
|
||||
</member>
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.MapWithState``4(Microsoft.Spark.CSharp.Streaming.DStream{System.Collections.Generic.KeyValuePair{``0,``1}},Microsoft.Spark.CSharp.Streaming.StateSpec{``0,``1,``2,``3})">
|
||||
<member name="M:Microsoft.Spark.CSharp.Streaming.PairDStreamFunctions.MapWithState``4(Microsoft.Spark.CSharp.Streaming.DStream{System.Tuple{``0,``1}},Microsoft.Spark.CSharp.Streaming.StateSpec{``0,``1,``2,``3})">
|
||||
<summary>
|
||||
Return a new "state" DStream where the state for each key is updated by applying
|
||||
the given function on the previous state of the key and the new values of the key.
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,4 +1,5 @@
|
|||
using System.IO;
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Collections.Generic;
|
||||
using System.Net;
|
||||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
|
@ -76,7 +77,7 @@ namespace AdapterTest
|
|||
// write update
|
||||
int key = 0;
|
||||
int value = 100;
|
||||
KeyValuePair<int, dynamic> update = new KeyValuePair<int, dynamic>(key, value);
|
||||
Tuple<int, dynamic> update = new Tuple<int, dynamic>(key, value);
|
||||
var ms = new MemoryStream();
|
||||
var formatter = new BinaryFormatter();
|
||||
formatter.Serialize(ms, update);
|
||||
|
@ -107,7 +108,7 @@ namespace AdapterTest
|
|||
// write update
|
||||
int key = 1;
|
||||
int value = 1000;
|
||||
KeyValuePair<int, dynamic> update = new KeyValuePair<int, dynamic>(key, value);
|
||||
Tuple<int, dynamic> update = new Tuple<int, dynamic>(key, value);
|
||||
var ms = new MemoryStream();
|
||||
var formatter = new BinaryFormatter();
|
||||
formatter.Serialize(ms, update);
|
||||
|
@ -119,8 +120,8 @@ namespace AdapterTest
|
|||
byte[] receiveBuffer = new byte[1];
|
||||
s.Read(receiveBuffer, 0, 1);
|
||||
|
||||
Assert.IsTrue(Accumulator.accumulatorRegistry.ContainsKey(update.Key));
|
||||
var accumulator = Accumulator.accumulatorRegistry[update.Key] as Accumulator<int>;
|
||||
Assert.IsTrue(Accumulator.accumulatorRegistry.ContainsKey(update.Item1));
|
||||
var accumulator = Accumulator.accumulatorRegistry[update.Item1] as Accumulator<int>;
|
||||
Assert.AreEqual(accumulator.Value, value);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -119,6 +119,7 @@
|
|||
<Compile Include="PairRDDTest.cs" />
|
||||
<Compile Include="ComparableRDDTest.cs" />
|
||||
<Compile Include="DoubleRDDTest.cs" />
|
||||
<Compile Include="UdfRegistrationTest.cs" />
|
||||
<Compile Include="UserDefinedFunctionTest.cs" />
|
||||
<Compile Include="WeakObjectManagerTest.cs" />
|
||||
</ItemGroup>
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
using System;
|
||||
using System.Linq;
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Sql;
|
||||
using NUnit.Framework;
|
||||
|
||||
|
@ -46,5 +48,13 @@ namespace AdapterTest
|
|||
builder.Config("doublevalue", 3.5D);
|
||||
Assert.True(builder.options["doublevalue"].Equals("3.5", StringComparison.InvariantCultureIgnoreCase));
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestEnableHiveSupport()
|
||||
{
|
||||
var builder = new Builder();
|
||||
builder.EnableHiveSupport();
|
||||
Assert.True(builder.options["spark.sql.catalogImplementation"].Equals("hive", StringComparison.InvariantCultureIgnoreCase));
|
||||
}
|
||||
}
|
||||
}
|
||||
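A hedged sketch of what the new EnableHiveSupport test asserts; only the Builder and EnableHiveSupport appear in this diff, so the final session-creation step is an assumption:

```c#
var builder = new Builder();
builder.EnableHiveSupport();   // records spark.sql.catalogImplementation=hive in builder.options

// A SparkSession created from this builder (via GetOrCreate, assumed) would then use the Hive
// catalog for persistent tables, which is what the new test checks through builder.options.
```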
|
|
|
@ -48,8 +48,8 @@ namespace AdapterTest
|
|||
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, long> countByWord = (KeyValuePair<string, long>)record;
|
||||
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
|
||||
Tuple<string, long> countByWord = (Tuple<string, long>)record;
|
||||
Assert.AreEqual(countByWord.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22);
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -91,7 +91,7 @@ namespace AdapterTest
|
|||
|
||||
var words = lines.FlatMap(l => l.Split(' '));
|
||||
|
||||
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
|
||||
var pairs = words.Map(w => new Tuple<string, int>(w, 1));
|
||||
|
||||
var wordCounts = pairs.PartitionBy().ReduceByKey((x, y) => x + y);
|
||||
|
||||
|
@ -102,8 +102,8 @@ namespace AdapterTest
|
|||
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
|
||||
Tuple<string, int> countByWord = (Tuple<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22);
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -116,8 +116,8 @@ namespace AdapterTest
|
|||
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, List<int>> countByWord = (KeyValuePair<string, List<int>>)record;
|
||||
Assert.AreEqual(countByWord.Value.Count, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
|
||||
Tuple<string, List<int>> countByWord = (Tuple<string, List<int>>)record;
|
||||
Assert.AreEqual(countByWord.Item2.Count, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22);
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -130,8 +130,8 @@ namespace AdapterTest
|
|||
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 46 : 44);
|
||||
Tuple<string, int> countByWord = (Tuple<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 46 : 44);
|
||||
}
|
||||
});
|
||||
}
@@ -147,12 +147,12 @@ namespace AdapterTest
|
||||
var words = lines.FlatMap(l => l.Split(' '));
|
||||
|
||||
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
|
||||
var pairs = words.Map(w => new Tuple<string, int>(w, 1));
|
||||
|
||||
var wordCounts = pairs.ReduceByKey((x, y) => x + y);
|
||||
|
||||
var left = wordCounts.Filter(x => x.Key != "quick" && x.Key != "lazy");
|
||||
var right = wordCounts.Filter(x => x.Key != "brown");
|
||||
var left = wordCounts.Filter(x => x.Item1 != "quick" && x.Item1 != "lazy");
|
||||
var right = wordCounts.Filter(x => x.Item1 != "brown");
|
||||
|
||||
var groupWith = left.GroupWith(right);
|
||||
groupWith.ForeachRDD((time, rdd) =>
@@ -162,15 +162,15 @@ namespace AdapterTest
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, Tuple<List<int>, List<int>>> countByWord = (KeyValuePair<string, Tuple<List<int>, List<int>>>)record;
|
||||
if (countByWord.Key == "quick" || countByWord.Key == "lazy")
|
||||
Assert.AreEqual(countByWord.Value.Item1.Count, 0);
|
||||
else if (countByWord.Key == "brown")
|
||||
Assert.AreEqual(countByWord.Value.Item2.Count, 0);
|
||||
Tuple<string, Tuple<List<int>, List<int>>> countByWord = (Tuple<string, Tuple<List<int>, List<int>>>)record;
|
||||
if (countByWord.Item1 == "quick" || countByWord.Item1 == "lazy")
|
||||
Assert.AreEqual(countByWord.Item2.Item1.Count, 0);
|
||||
else if (countByWord.Item1 == "brown")
|
||||
Assert.AreEqual(countByWord.Item2.Item2.Count, 0);
|
||||
else
|
||||
{
|
||||
Assert.AreEqual(countByWord.Value.Item1[0], countByWord.Key == "The" || countByWord.Key == "dog" ? 23 : 22);
|
||||
Assert.AreEqual(countByWord.Value.Item2[0], countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
|
||||
Assert.AreEqual(countByWord.Item2.Item1[0], countByWord.Item1 == "The" || countByWord.Item1 == "dog" ? 23 : 22);
|
||||
Assert.AreEqual(countByWord.Item2.Item2[0], countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22);
|
||||
}
|
||||
}
|
||||
});
@@ -183,9 +183,9 @@ namespace AdapterTest
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, Tuple<int, int>> countByWord = (KeyValuePair<string, Tuple<int, int>>)record;
|
||||
Assert.AreEqual(countByWord.Value.Item1, countByWord.Key == "The" || countByWord.Key == "dog" ? 23 : 22);
|
||||
Assert.AreEqual(countByWord.Value.Item2, countByWord.Key == "The" || countByWord.Key == "dog" ? 23 : 22);
|
||||
Tuple<string, Tuple<int, int>> countByWord = (Tuple<string, Tuple<int, int>>)record;
|
||||
Assert.AreEqual(countByWord.Item2.Item1, countByWord.Item1 == "The" || countByWord.Item1 == "dog" ? 23 : 22);
|
||||
Assert.AreEqual(countByWord.Item2.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" ? 23 : 22);
|
||||
}
|
||||
});
@@ -197,11 +197,11 @@ namespace AdapterTest
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, Tuple<int, Option<int>>> countByWord = (KeyValuePair<string, Tuple<int, Option<int>>>)record;
|
||||
Assert.AreEqual(countByWord.Value.Item1, countByWord.Key == "The" || countByWord.Key == "dog" ? 23 : 22);
|
||||
Assert.IsTrue(countByWord.Key == "The" || countByWord.Key == "dog" ?
|
||||
countByWord.Value.Item2.IsDefined == true && countByWord.Value.Item2.GetValue() == 23 : (countByWord.Key == "brown" ?
|
||||
countByWord.Value.Item2.IsDefined == true == false : countByWord.Value.Item2.IsDefined == true && countByWord.Value.Item2.GetValue() == 22));
|
||||
Tuple<string, Tuple<int, Option<int>>> countByWord = (Tuple<string, Tuple<int, Option<int>>>)record;
|
||||
Assert.AreEqual(countByWord.Item2.Item1, countByWord.Item1 == "The" || countByWord.Item1 == "dog" ? 23 : 22);
|
||||
Assert.IsTrue(countByWord.Item1 == "The" || countByWord.Item1 == "dog" ?
|
||||
countByWord.Item2.Item2.IsDefined == true && countByWord.Item2.Item2.GetValue() == 23 : (countByWord.Item1 == "brown" ?
|
||||
countByWord.Item2.Item2.IsDefined == true == false : countByWord.Item2.Item2.IsDefined == true && countByWord.Item2.Item2.GetValue() == 22));
|
||||
}
|
||||
});
@@ -213,12 +213,12 @@ namespace AdapterTest
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, Tuple<Option<int>, int>> countByWord = (KeyValuePair<string, Tuple<Option<int>, int>>)record;
|
||||
Assert.IsTrue(countByWord.Key == "The" || countByWord.Key == "dog" ?
|
||||
countByWord.Value.Item1.IsDefined == true && countByWord.Value.Item1.GetValue() == 23 :
|
||||
(countByWord.Key == "quick" || countByWord.Key == "lazy" ? countByWord.Value.Item1.IsDefined == false :
|
||||
countByWord.Value.Item1.IsDefined == true && countByWord.Value.Item1.GetValue() == 22));
|
||||
Assert.AreEqual(countByWord.Value.Item2, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
|
||||
Tuple<string, Tuple<Option<int>, int>> countByWord = (Tuple<string, Tuple<Option<int>, int>>)record;
|
||||
Assert.IsTrue(countByWord.Item1 == "The" || countByWord.Item1 == "dog" ?
|
||||
countByWord.Item2.Item1.IsDefined == true && countByWord.Item2.Item1.GetValue() == 23 :
|
||||
(countByWord.Item1 == "quick" || countByWord.Item1 == "lazy" ? countByWord.Item2.Item1.IsDefined == false :
|
||||
countByWord.Item2.Item1.IsDefined == true && countByWord.Item2.Item1.GetValue() == 22));
|
||||
Assert.AreEqual(countByWord.Item2.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22);
|
||||
}
|
||||
});
@@ -230,15 +230,15 @@ namespace AdapterTest
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, Tuple<Option<int>, Option<int>>> countByWord = (KeyValuePair<string, Tuple<Option<int>, Option<int>>>)record;
|
||||
Assert.IsTrue(countByWord.Key == "The" || countByWord.Key == "dog" ?
|
||||
countByWord.Value.Item1.IsDefined == true && countByWord.Value.Item1.GetValue() == 23 :
|
||||
(countByWord.Key == "quick" || countByWord.Key == "lazy" ? countByWord.Value.Item1.IsDefined == false :
|
||||
countByWord.Value.Item1.IsDefined == true && countByWord.Value.Item1.GetValue() == 22));
|
||||
Tuple<string, Tuple<Option<int>, Option<int>>> countByWord = (Tuple<string, Tuple<Option<int>, Option<int>>>)record;
|
||||
Assert.IsTrue(countByWord.Item1 == "The" || countByWord.Item1 == "dog" ?
|
||||
countByWord.Item2.Item1.IsDefined == true && countByWord.Item2.Item1.GetValue() == 23 :
|
||||
(countByWord.Item1 == "quick" || countByWord.Item1 == "lazy" ? countByWord.Item2.Item1.IsDefined == false :
|
||||
countByWord.Item2.Item1.IsDefined == true && countByWord.Item2.Item1.GetValue() == 22));
|
||||
|
||||
Assert.IsTrue(countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ?
|
||||
countByWord.Value.Item2.IsDefined == true && countByWord.Value.Item2.GetValue() == 23 :
|
||||
(countByWord.Key == "brown" ? countByWord.Value.Item2.IsDefined == false : countByWord.Value.Item2.IsDefined == true && countByWord.Value.Item2.GetValue() == 22));
|
||||
Assert.IsTrue(countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ?
|
||||
countByWord.Item2.Item2.IsDefined == true && countByWord.Item2.Item2.GetValue() == 23 :
|
||||
(countByWord.Item1 == "brown" ? countByWord.Item2.Item2.IsDefined == false : countByWord.Item2.Item2.IsDefined == true && countByWord.Item2.Item2.GetValue() == 22));
|
||||
}
|
||||
});
|
||||
}
@@ -254,7 +254,7 @@ namespace AdapterTest
|
||||
var words = lines.FlatMap(l => l.Split(' '));
|
||||
|
||||
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
|
||||
var pairs = words.Map(w => new Tuple<string, int>(w, 1));
|
||||
|
||||
var doubleCounts = pairs.GroupByKey().FlatMapValues(vs => vs).MapValues(v => 2 * v).ReduceByKey((x, y) => x + y);
|
||||
doubleCounts.ForeachRDD((time, rdd) =>
@@ -264,15 +264,15 @@ namespace AdapterTest
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 2 * 23 : 2 * 22);
|
||||
Tuple<string, int> countByWord = (Tuple<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 2 * 23 : 2 * 22);
|
||||
}
|
||||
});
|
||||
|
||||
// disable pipelining to UpdateStateByKey, which relies on checkpointing that the mock proxy doesn't support
|
||||
pairs.Cache();
|
||||
|
||||
var initialStateRdd = ssc.SparkContext.Parallelize(new[] { "AAA" }).Map( w => new KeyValuePair<string, int>("AAA", 22));
|
||||
var initialStateRdd = ssc.SparkContext.Parallelize(new[] { "AAA" }).Map( w => new Tuple<string, int>("AAA", 22));
|
||||
var state = pairs.UpdateStateByKey<string, int, int>((v, s) => s + (v as List<int>).Count, initialStateRdd);
|
||||
state.ForeachRDD((time, rdd) =>
|
||||
{
@@ -281,8 +281,8 @@ namespace AdapterTest
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22, countByWord.Value);
|
||||
Tuple<string, int> countByWord = (Tuple<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22, countByWord.Item2);
|
||||
}
|
||||
});
@@ -295,8 +295,8 @@ namespace AdapterTest
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22, countByWord.Value);
|
||||
Tuple<string, int> countByWord = (Tuple<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22, countByWord.Item2);
|
||||
}
|
||||
});
|
||||
}
@@ -330,7 +330,7 @@ namespace AdapterTest
var ssc = new StreamingContext(new SparkContext(sparkContextProxy.Object, sparkConf), 10000L);
|
||||
|
||||
var dstreamProxy = new Mock<IDStreamProxy>();
|
||||
var pairDStream = new DStream<KeyValuePair<string, int>>(dstreamProxy.Object, ssc);
|
||||
var pairDStream = new DStream<Tuple<string, int>>(dstreamProxy.Object, ssc);
|
||||
|
||||
var stateSpec = new StateSpec<string, int, int, int>((k, v, s) => v);
|
||||
var stateDStream = pairDStream.MapWithState(stateSpec);
@@ -373,7 +373,7 @@ namespace AdapterTest
Assert.IsNotNull(resultRdd);
|
||||
|
||||
// test when initialStateRdd is not null
|
||||
var initialStateRdd = new RDD<KeyValuePair<string, int>>(new Mock<IRDDProxy>().Object, null);
|
||||
var initialStateRdd = new RDD<Tuple<string, int>>(new Mock<IRDDProxy>().Object, null);
|
||||
var stateSpec2 = new StateSpec<string, int, int, int>((k, v, s) => v).InitialState(initialStateRdd).NumPartitions(2);
|
||||
var helper2 = new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, stateSpec2);
@@ -404,13 +404,13 @@ namespace AdapterTest
|
||||
var input = new dynamic[4];
|
||||
|
||||
var preStateRddRecord = new MapWithStateRDDRecord<string, int, int>(ticks - TimeSpan.FromSeconds(2).Ticks, new [] { new KeyValuePair<string, int>("1", 1), new KeyValuePair<string, int>("2", 2)});
|
||||
var preStateRddRecord = new MapWithStateRDDRecord<string, int, int>(ticks - TimeSpan.FromSeconds(2).Ticks, new [] { new Tuple<string, int>("1", 1), new Tuple<string, int>("2", 2)});
|
||||
preStateRddRecord.stateMap.Add("expired", new KeyedState<int>(0, ticks - TimeSpan.FromSeconds(60).Ticks));
|
||||
|
||||
input[0] = preStateRddRecord;
|
||||
input[1] = new KeyValuePair<string, int>("1", -1);
|
||||
input[2] = new KeyValuePair<string, int>("2", 2);
|
||||
input[3] = new KeyValuePair<string, int>("3", 3);
|
||||
input[1] = new Tuple<string, int>("1", -1);
|
||||
input[2] = new Tuple<string, int>("2", 2);
|
||||
input[3] = new Tuple<string, int>("3", 3);
|
||||
|
||||
var result = helper.Execute(1, input).GetEnumerator();
|
||||
Assert.IsNotNull(result);
@@ -19,7 +19,7 @@ namespace AdapterTest
var sparkContext = new SparkContext(null);
|
||||
var lines = sparkContext.TextFile(Path.GetTempFileName());
|
||||
var words = lines.FlatMap(l => l.Split(' '));
|
||||
doubles = words.Map(w => new KeyValuePair<string, int>(w, 1)).ReduceByKey((x, y) => x + y).Map(kv => (double)kv.Value);
|
||||
doubles = words.Map(w => new Tuple<string, int>(w, 1)).ReduceByKey((x, y) => x + y).Map(kv => (double)kv.Item2);
|
||||
}
|
||||
|
||||
[Test]
@@ -22,7 +22,7 @@ namespace AdapterTest
var streamingContextProxy = new Mock<IStreamingContextProxy>();
|
||||
var mockDstreamProxy = new Mock<IDStreamProxy>().Object;
|
||||
streamingContextProxy.Setup(
|
||||
m => m.EventHubsUnionStream(It.IsAny<Dictionary<string, string>>(), It.IsAny<StorageLevelType>()))
|
||||
m => m.EventHubsUnionStream(It.IsAny<IEnumerable<Tuple<string, string>>>(), It.IsAny<StorageLevelType>()))
|
||||
.Returns(mockDstreamProxy);
|
||||
|
||||
var mockSparkClrProxy = new Mock<ISparkCLRProxy>();
@@ -32,7 +32,7 @@ namespace AdapterTest
|
||||
var sparkContext = new SparkContext(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy, new SparkConf(new Mock<ISparkConfProxy>().Object));
|
||||
var streamingContext = new StreamingContext(sparkContext, 123L);
|
||||
var dstream = EventHubsUtils.CreateUnionStream(streamingContext, new Dictionary<string, string>());
|
||||
var dstream = EventHubsUtils.CreateUnionStream(streamingContext, new List<Tuple<string, string>>());
|
||||
Assert.AreEqual(mockDstreamProxy, dstream.DStreamProxy);
|
||||
}
|
||||
}
@@ -137,7 +137,7 @@ namespace AdapterTest.Mocks
return this;
|
||||
}
|
||||
|
||||
public IRDDProxy SampleByKey(bool withReplacement, Dictionary<string, double> fractions, long seed)
|
||||
public IRDDProxy SampleByKey(bool withReplacement, IEnumerable<Tuple<string, double>> fractions, long seed)
|
||||
{
|
||||
return this;
|
||||
}
@@ -152,13 +152,13 @@ namespace AdapterTest.Mocks
return null;
|
||||
}
|
||||
|
||||
public void SaveAsNewAPIHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf)
|
||||
public void SaveAsNewAPIHadoopDataset(IEnumerable<Tuple<string, string>> conf)
|
||||
{ }
|
||||
|
||||
public void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf)
|
||||
public void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf)
|
||||
{ }
|
||||
|
||||
public void SaveAsHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf)
|
||||
public void SaveAsHadoopDataset(IEnumerable<Tuple<string, string>> conf)
|
||||
{ }
|
||||
|
||||
public void SaveAsSequenceFile(string path, string compressionCodecClass)
@@ -168,7 +168,7 @@ namespace AdapterTest.Mocks
{ }
|
||||
|
||||
|
||||
public void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf, string compressionCodecClass)
|
||||
public void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<Tuple<string, string>> conf, string compressionCodecClass)
|
||||
{ }
@@ -135,22 +135,22 @@ namespace AdapterTest.Mocks
return new MockRddProxy(null);
|
||||
}
|
||||
|
||||
public IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
|
||||
public IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
|
||||
{
|
||||
return new MockRddProxy(null);
|
||||
}
|
||||
|
||||
public IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
|
||||
public IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
|
||||
{
|
||||
return new MockRddProxy(null);
|
||||
}
|
||||
|
||||
public IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
|
||||
public IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
|
||||
{
|
||||
return new MockRddProxy(null);
|
||||
}
|
||||
|
||||
public IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
|
||||
public IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<Tuple<string, string>> conf, int batchSize)
|
||||
{
|
||||
return new MockRddProxy(null);
|
||||
}
@@ -13,7 +13,7 @@ namespace AdapterTest.Mocks
class MockSparkSessionProxy : ISparkSessionProxy
|
||||
{
|
||||
public ISqlContextProxy SqlContextProxy { get { return new MockSqlContextProxy(new MockSparkContextProxy(new MockSparkConfProxy()));} }
|
||||
public IUdfRegistration Udf { get; }
|
||||
public IUdfRegistrationProxy Udf { get; }
|
||||
public ICatalogProxy GetCatalog()
|
||||
{
|
||||
throw new NotImplementedException();
@@ -4,11 +4,8 @@
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Runtime.Serialization;
|
||||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Proxy;
@@ -39,17 +36,17 @@ namespace AdapterTest.Mocks
return new MockDStreamProxy();
|
||||
}
|
||||
|
||||
public IDStreamProxy KafkaStream(Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, Microsoft.Spark.CSharp.Core.StorageLevelType storageLevelType)
|
||||
public IDStreamProxy KafkaStream(IEnumerable<Tuple<string, int>> topics, IEnumerable<Tuple<string, string>> kafkaParams, Microsoft.Spark.CSharp.Core.StorageLevelType storageLevelType)
|
||||
{
|
||||
return new MockDStreamProxy();
|
||||
}
|
||||
|
||||
public IDStreamProxy DirectKafkaStream(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets)
|
||||
public IDStreamProxy DirectKafkaStream(List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets)
|
||||
{
|
||||
return new MockDStreamProxy();
|
||||
}
|
||||
|
||||
public IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets,
|
||||
public IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, IEnumerable<Tuple<string, string>> kafkaParams, IEnumerable<Tuple<string, long>> fromOffsets,
|
||||
int numPartitions, byte[] readFunc, string serializationMode)
|
||||
{
|
||||
return new MockDStreamProxy();
@@ -93,12 +90,22 @@ namespace AdapterTest.Mocks
public IDStreamProxy CreateCSharpReducedWindowedDStream(IDStreamProxy jdstream, byte[] func, byte[] invFunc, int windowSeconds, int slideSeconds, string serializationMode)
|
||||
{
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> f = (Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>>) formatter.Deserialize(new MemoryStream(func));
|
||||
RDD<dynamic> rdd = f(DateTime.UtcNow.Ticks,
|
||||
|
||||
var ticks = DateTime.UtcNow.Ticks;
|
||||
RDD<dynamic> rdd = f(ticks,
|
||||
new RDD<dynamic>((jdstream as MockDStreamProxy).rddProxy ?? new MockRddProxy(null), new SparkContext("", "")),
|
||||
new RDD<dynamic>((jdstream as MockDStreamProxy).rddProxy ?? new MockRddProxy(null), new SparkContext("", "")));
|
||||
return new MockDStreamProxy(rdd.RddProxy);
|
||||
}
|
||||
|
||||
if (invFunc == null) return new MockDStreamProxy(rdd.RddProxy);
|
||||
|
||||
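// assumption about the mock's intent: when an inverse function is supplied, the windowed result is
// approximated by subtracting the inverse-reduced RDD from the reduced RDD (hence the Subtract call below)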
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> invf = (Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>>) formatter.Deserialize(new MemoryStream(invFunc));
|
||||
RDD<dynamic> invRdd = invf(ticks,
|
||||
new RDD<dynamic>((jdstream as MockDStreamProxy).rddProxy ?? new MockRddProxy(null), new SparkContext("", "")),
|
||||
new RDD<dynamic>((jdstream as MockDStreamProxy).rddProxy ?? new MockRddProxy(null), new SparkContext("", "")));
|
||||
var difference = rdd.Subtract(invRdd);
|
||||
|
||||
return new MockDStreamProxy(difference.RddProxy);
|
||||
}
|
||||
|
||||
public IDStreamProxy CreateCSharpStateDStream(IDStreamProxy jdstream, byte[] func, string className, string serializationMode, string serializationMode2)
|
||||
{
@@ -119,7 +126,7 @@ namespace AdapterTest.Mocks
return new MockDStreamProxy();
|
||||
}
|
||||
|
||||
public IDStreamProxy EventHubsUnionStream(Dictionary<string, string> eventHubsParams, StorageLevelType storageLevelType)
|
||||
public IDStreamProxy EventHubsUnionStream(IEnumerable<Tuple<string, string>> eventHubsParams, StorageLevelType storageLevelType)
|
||||
{
|
||||
throw new NotImplementedException();
|
||||
}
@@ -10,7 +10,7 @@ namespace AdapterTest
[TestFixture]
|
||||
public class PairRDDTest
|
||||
{
|
||||
private static RDD<KeyValuePair<string, int>> pairs;
|
||||
private static RDD<Tuple<string, int>> pairs;
|
||||
|
||||
[OneTimeSetUp]
|
||||
public static void Initialize()
@@ -18,7 +18,7 @@ namespace AdapterTest
var sparkContext = new SparkContext(null);
|
||||
var lines = sparkContext.TextFile(Path.GetTempFileName());
|
||||
var words = lines.FlatMap(l => l.Split(' '));
|
||||
pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
|
||||
pairs = words.Map(w => new Tuple<string, int>(w, 1));
|
||||
}
|
||||
|
||||
[Test]
@@ -27,7 +27,7 @@ namespace AdapterTest
foreach (var record in pairs.CountByKey())
|
||||
{
|
||||
// the 1st parameter of the AreEqual() method is the expected value, the 2nd one is the actual value.
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2);
|
||||
}
|
||||
}
@@ -36,53 +36,53 @@ namespace AdapterTest
{
|
||||
foreach (var record in pairs.GroupWith(pairs).Collect())
|
||||
{
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item1.Count);
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item2.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item1.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item2.Count);
|
||||
}
|
||||
foreach (var record in pairs.GroupWith(pairs, pairs).Collect())
|
||||
{
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item1.Count);
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item2.Count);
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item3.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item1.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item2.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item3.Count);
|
||||
}
|
||||
foreach (var record in pairs.GroupWith(pairs, pairs, pairs).Collect())
|
||||
{
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item1.Count);
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item2.Count);
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item3.Count);
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item4.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item1.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item2.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item3.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item4.Count);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test RDD.GroupWith() method with different KeyValuePair<K,V> types.
|
||||
/// Test RDD.GroupWith() method with different Tuple<K,V> types.
|
||||
/// </summary>
|
||||
[Test]
|
||||
public void TestPairRddGroupWith2()
|
||||
{
|
||||
var pairs1 = pairs.Map(p => new KeyValuePair<string, double>(p.Key, Convert.ToDouble(p.Value)));
|
||||
var pairs2 = pairs.Map(p => new KeyValuePair<string, string>(p.Key, p.Value.ToString()));
|
||||
var pairs3 = pairs.Map(p => new KeyValuePair<string, long>(p.Key, Convert.ToInt64(p.Value)));
|
||||
var pairs1 = pairs.Map(p => new Tuple<string, double>(p.Item1, Convert.ToDouble(p.Item2)));
|
||||
var pairs2 = pairs.Map(p => new Tuple<string, string>(p.Item1, p.Item2.ToString()));
|
||||
var pairs3 = pairs.Map(p => new Tuple<string, long>(p.Item1, Convert.ToInt64(p.Item2)));
|
||||
|
||||
foreach (var record in pairs.GroupWith(pairs1).Collect())
|
||||
{
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item1.Count);
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item2.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item1.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item2.Count);
|
||||
}
|
||||
|
||||
foreach (var record in pairs.GroupWith(pairs1, pairs2).Collect())
|
||||
{
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item1.Count);
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item2.Count);
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item3.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item1.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item2.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item3.Count);
|
||||
}
|
||||
|
||||
foreach (var record in pairs.GroupWith(pairs1, pairs2, pairs3).Collect())
|
||||
{
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item1.Count);
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item2.Count);
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item3.Count);
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Item4.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item1.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item2.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item3.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Item4.Count);
|
||||
}
|
||||
}
@@ -90,10 +90,10 @@ namespace AdapterTest
public void TestPairRddSubtractByKey()
|
||||
{
|
||||
var reduce = pairs.ReduceByKey((x, y) => x + y);
|
||||
var records = reduce.SubtractByKey(reduce.Filter(kvp => kvp.Key != "The")).Collect();
|
||||
var records = reduce.SubtractByKey(reduce.Filter(kvp => kvp.Item1 != "The")).Collect();
|
||||
Assert.AreEqual(1, records.Length);
|
||||
Assert.AreEqual("The", records[0].Key);
|
||||
Assert.AreEqual(23, records[0].Value);
|
||||
Assert.AreEqual("The", records[0].Item1);
|
||||
Assert.AreEqual(23, records[0].Item2);
|
||||
}
|
||||
|
||||
[Test]
@@ -105,12 +105,45 @@ namespace AdapterTest
}
|
||||
}
|
||||
|
||||
[Serializable]
|
||||
private class IntWrapper
|
||||
{
|
||||
public IntWrapper(int value)
|
||||
{
|
||||
Value = value;
|
||||
}
|
||||
|
||||
public int Value { get; }
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestPairRddReduceByKeyWithObjects()
|
||||
{
|
||||
// The ReduceByKey method below fails with NPE if ReduceByKey
|
||||
// calls CombineByKey with () => default(V) as seed generator
|
||||
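// presumably because default(V) is null for a reference type such as IntWrapper below, so the
// reduce function would be handed a null seed and throw a NullReferenceException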
var sums = pairs
|
||||
.MapValues(value => new IntWrapper(value))
|
||||
.ReduceByKey((x, y) => new IntWrapper(x.Value + y.Value));
|
||||
|
||||
var result = sums
|
||||
.CollectAsMap()
|
||||
.Select(pair => new KeyValuePair<string, int>(pair.Key, pair.Value.Value))
|
||||
.ToList();
|
||||
|
||||
var expectedResult = pairs
|
||||
.ReduceByKey((x, y) => x + y)
|
||||
.CollectAsMap()
|
||||
.ToList();
|
||||
|
||||
Assert.That(result, Is.EquivalentTo(expectedResult));
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestPairRddFoldByKey()
|
||||
{
|
||||
foreach (var record in pairs.FoldByKey(() => 0, (x, y) => x + y).Collect())
|
||||
{
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2);
|
||||
}
|
||||
}
@@ -119,7 +152,7 @@ namespace AdapterTest
{
|
||||
foreach (var record in pairs.AggregateByKey(() => 0, (x, y) => x + y, (x, y) => x + y).Collect())
|
||||
{
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2);
|
||||
}
|
||||
}
@@ -128,7 +161,7 @@ namespace AdapterTest
{
|
||||
foreach (var record in pairs.GroupByKey().Collect())
|
||||
{
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Count);
|
||||
}
|
||||
}
@@ -165,7 +198,7 @@ namespace AdapterTest
[Test]
|
||||
public void TestPairRddSortByKey()
|
||||
{
|
||||
var expectedSortedRdd = pairs.Collect().OrderBy(kv => kv.Key, StringComparer.OrdinalIgnoreCase).ToArray();
|
||||
var expectedSortedRdd = pairs.Collect().OrderBy(kv => kv.Item1, StringComparer.OrdinalIgnoreCase).ToArray();
|
||||
var rddSortByKey = pairs.SortByKey(true, null, key => key.ToLowerInvariant()).Collect();
|
||||
CollectionAssert.AreEqual(expectedSortedRdd, rddSortByKey);
|
||||
}
@@ -173,11 +206,19 @@ namespace AdapterTest
[Test]
|
||||
public void TestPairRddSortByKey2()
|
||||
{
|
||||
var expectedSortedRdd = pairs.Collect().OrderBy(kv => kv.Key, StringComparer.OrdinalIgnoreCase).ToArray();
|
||||
var expectedSortedRdd = pairs.Collect().OrderBy(kv => kv.Item1, StringComparer.OrdinalIgnoreCase).ToArray();
|
||||
var rddSortByKey = pairs.SortByKey(true, 1, key => key.ToLowerInvariant()).Collect();
|
||||
CollectionAssert.AreEqual(expectedSortedRdd, rddSortByKey);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestPairRddSortByKey3()
|
||||
{
|
||||
var expectedSortedRdd = pairs.Collect().OrderByDescending(kv => kv.Item1, StringComparer.OrdinalIgnoreCase).ToArray();
|
||||
var rddSortByKey = pairs.SortByKey(false, 1, key => key.ToLowerInvariant()).Collect();
|
||||
CollectionAssert.AreEqual(expectedSortedRdd, rddSortByKey);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestPairRddProxy()
|
||||
{
@@ -20,6 +20,7 @@ namespace AdapterTest
public class RDDTest
|
||||
{
|
||||
private static RDD<string> words;
|
||||
private static RDD<string> empty;
|
||||
|
||||
[OneTimeSetUp]
|
||||
public static void Initialize()
@@ -27,6 +28,7 @@ namespace AdapterTest
var sparkContext = new SparkContext(null);
|
||||
var lines = sparkContext.TextFile(Path.GetTempFileName());
|
||||
words = lines.FlatMap(l => l.Split(' '));
|
||||
empty = sparkContext.EmptyRDD<string>();
|
||||
}
|
||||
|
||||
[Test]
@@ -42,7 +44,7 @@ namespace AdapterTest
{
|
||||
foreach (var record in words.CountByValue())
|
||||
{
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2);
|
||||
}
|
||||
}
@@ -82,6 +84,7 @@ namespace AdapterTest
public void TestRddTreeAggregate()
|
||||
{
|
||||
Assert.AreEqual(201, words.Map(w => 1).TreeAggregate(0, (x, y) => x + y, (x, y) => x + y));
|
||||
Assert.Throws<ArgumentException>(() => empty.TreeAggregate(0, (x, y) => 1, (x, y) => x + y, 0));
|
||||
}
|
||||
|
||||
[Test]
@@ -119,14 +122,14 @@ namespace AdapterTest
{
|
||||
words.GroupBy(w => w).Foreach(record =>
|
||||
{
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Count);
|
||||
});
|
||||
|
||||
words.GroupBy(w => w).ForeachPartition(iter =>
|
||||
{
|
||||
foreach (var record in iter)
|
||||
{
|
||||
Assert.AreEqual(record.Key == "The" || record.Key == "dog" || record.Key == "lazy" ? 23 : 22, record.Value.Count);
|
||||
Assert.AreEqual(record.Item1 == "The" || record.Item1 == "dog" || record.Item1 == "lazy" ? 23 : 22, record.Item2.Count);
|
||||
}
|
||||
});
|
||||
}
@@ -135,6 +138,7 @@ namespace AdapterTest
public void TestRddIsEmpty()
|
||||
{
|
||||
Assert.IsFalse(words.IsEmpty());
|
||||
Assert.IsTrue(empty.IsEmpty());
|
||||
Assert.IsTrue(words.Filter(w => w == null).IsEmpty());
|
||||
}
@@ -144,7 +148,7 @@ namespace AdapterTest
int index = 0;
|
||||
foreach(var record in words.ZipWithIndex().Collect())
|
||||
{
|
||||
Assert.AreEqual(index++, record.Value);
|
||||
Assert.AreEqual(index++, record.Item2);
|
||||
}
|
||||
}
@@ -155,7 +159,7 @@ namespace AdapterTest
int num = words.GetNumPartitions();
|
||||
foreach (var record in words.ZipWithUniqueId().Collect())
|
||||
{
|
||||
Assert.AreEqual(num * index++, record.Value);
|
||||
Assert.AreEqual(num * index++, record.Item2);
|
||||
}
|
||||
}
@@ -166,6 +170,7 @@ namespace AdapterTest
Assert.AreEqual(20, words.TakeSample(true, 20, 1).Length);
|
||||
Assert.Throws<ArgumentException>(() => words.TakeSample(true, -1, 1));
|
||||
Assert.AreEqual(0, words.TakeSample(true, 0, 1).Length);
|
||||
Assert.AreEqual(20, words.TakeSample(false, 20, 1).Length);
|
||||
}
|
||||
|
||||
[Test]
@@ -152,6 +152,23 @@ namespace AdapterTest
Assert.IsNotNull(hadoopConf);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestRunJob()
|
||||
{
|
||||
// Arrange
|
||||
Mock<ISparkContextProxy> sparkContextProxy = new Mock<ISparkContextProxy>();
|
||||
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
|
||||
RDD<int> rdd = sc.Parallelize(new int[] {0, 1, 2, 3, 4, 5}, 2);
|
||||
sparkContextProxy.Setup(m => m.RunJob(It.IsAny<IRDDProxy>(), It.IsAny<IEnumerable<int>>()));
|
||||
|
||||
// Act
|
||||
int[] partitions = new int[] { 0, 1 };
|
||||
rdd.SparkContext.RunJob(rdd, partitions);
|
||||
|
||||
// Assert
|
||||
sparkContextProxy.Verify(m => m.RunJob(rdd.RddProxy, partitions), Times.Once);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestCancelAllJobs()
|
||||
{
@@ -355,7 +372,7 @@ namespace AdapterTest
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
|
||||
|
||||
// Act
|
||||
RDD<KeyValuePair<byte[], byte[]>> rdd = sc.WholeTextFiles(filePath, null);
|
||||
RDD<Tuple<byte[], byte[]>> rdd = sc.WholeTextFiles(filePath, null);
|
||||
|
||||
// Assert
|
||||
Assert.IsNotNull(rdd);
@@ -377,7 +394,7 @@ namespace AdapterTest
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
|
||||
|
||||
// Act
|
||||
RDD<KeyValuePair<byte[], byte[]>> rdd = sc.BinaryFiles(filePath, null);
|
||||
RDD<Tuple<byte[], byte[]>> rdd = sc.BinaryFiles(filePath, null);
|
||||
|
||||
// Assert
|
||||
Assert.IsNotNull(rdd);
@@ -428,7 +445,7 @@ namespace AdapterTest
|
||||
Mock<IRDDProxy> rddProxy = new Mock<IRDDProxy>();
|
||||
Mock<ISparkContextProxy> sparkContextProxy = new Mock<ISparkContextProxy>();
|
||||
sparkContextProxy.Setup(m => m.NewAPIHadoopFile(filePath, It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<KeyValuePair<string, string>>>(), It.IsAny<int>()))
|
||||
sparkContextProxy.Setup(m => m.NewAPIHadoopFile(filePath, It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<Tuple<string, string>>>(), It.IsAny<int>()))
|
||||
.Returns(rddProxy.Object);
|
||||
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
|
||||
@@ -456,7 +473,7 @@ namespace AdapterTest
|
||||
Mock<IRDDProxy> rddProxy = new Mock<IRDDProxy>();
|
||||
Mock<ISparkContextProxy> sparkContextProxy = new Mock<ISparkContextProxy>();
|
||||
sparkContextProxy.Setup(m => m.HadoopFile(filePath, It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<KeyValuePair<string, string>>>(), It.IsAny<int>()))
|
||||
sparkContextProxy.Setup(m => m.HadoopFile(filePath, It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<Tuple<string, string>>>(), It.IsAny<int>()))
|
||||
.Returns(rddProxy.Object);
|
||||
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
@@ -482,12 +499,12 @@ namespace AdapterTest
|
||||
Mock<IRDDProxy> rddProxy = new Mock<IRDDProxy>();
|
||||
Mock<ISparkContextProxy> sparkContextProxy = new Mock<ISparkContextProxy>();
|
||||
sparkContextProxy.Setup(m => m.NewAPIHadoopRDD(It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<KeyValuePair<string, string>>>(), It.IsAny<int>()))
|
||||
sparkContextProxy.Setup(m => m.NewAPIHadoopRDD(It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<Tuple<string, string>>>(), It.IsAny<int>()))
|
||||
.Returns(rddProxy.Object);
|
||||
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
|
||||
|
||||
const string inputFormatClass = "org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
|
||||
var conf = new KeyValuePair<string, string>[] { };
|
||||
var conf = new Tuple<string, string>[] { };
|
||||
// Act
|
||||
RDD<byte[]> rdd = sc.NewAPIHadoopRDD(inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, conf);
@@ -509,12 +526,12 @@ namespace AdapterTest
|
||||
Mock<IRDDProxy> rddProxy = new Mock<IRDDProxy>();
|
||||
Mock<ISparkContextProxy> sparkContextProxy = new Mock<ISparkContextProxy>();
|
||||
sparkContextProxy.Setup(m => m.HadoopRDD(It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<KeyValuePair<string, string>>>(), It.IsAny<int>()))
|
||||
sparkContextProxy.Setup(m => m.HadoopRDD(It.IsAny<string>(), keyClass, valueClass, keyConverterClass, valueConverterClass, It.IsAny<IEnumerable<Tuple<string, string>>>(), It.IsAny<int>()))
|
||||
.Returns(rddProxy.Object);
|
||||
SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
|
||||
|
||||
const string inputFormatClass = "org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
|
||||
var conf = new KeyValuePair<string, string>[] { };
|
||||
var conf = new Tuple<string, string>[] { };
|
||||
// Act
|
||||
RDD<byte[]> rdd = sc.HadoopRDD(inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, conf);
@@ -32,22 +32,22 @@ namespace AdapterTest
var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
|
||||
Assert.IsNotNull(socketStream.DStreamProxy);
|
||||
|
||||
var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId", new Dictionary<string, int> { { "testTopic1", 1 } }, new Dictionary<string, string>());
|
||||
var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId", new [] { Tuple.Create("testTopic1", 1) }, null);
|
||||
Assert.IsNotNull(kafkaStream.DStreamProxy);
|
||||
|
||||
var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" }, new Dictionary<string, string>(), new Dictionary<string, long>());
|
||||
var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" }, new List<Tuple<string, string>>(), new List<Tuple<string, long>>());
|
||||
Assert.IsNotNull(directKafkaStream.DStreamProxy);
|
||||
|
||||
ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numPartitions.testTopic3", "10");
|
||||
|
||||
var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic3" }, new Dictionary<string, string>(), new Dictionary<string, long>());
|
||||
var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic3" }, new List<Tuple<string, string>>(), new List<Tuple<string, long>>());
|
||||
Assert.IsNotNull(directKafkaStreamWithRepartition.DStreamProxy);
|
||||
|
||||
var directKafkaStreamWithRepartitionAndReadFunc = KafkaUtils.CreateDirectStream(
|
||||
ssc,
|
||||
new List<string> { "testTopic3" },
|
||||
new Dictionary<string, string>(), new Dictionary<string, long>(),
|
||||
(int pid, IEnumerable<KeyValuePair<byte[], byte[]>> input) => { return input; });
|
||||
new List<Tuple<string, string>>(), new List<Tuple<string, long>>(),
|
||||
(int pid, IEnumerable<Tuple<byte[], byte[]>> input) => { return input; });
|
||||
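// the read function above is an identity lambda; it only exercises the overload that accepts a per-partition read callback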
Assert.IsNotNull(directKafkaStreamWithRepartitionAndReadFunc);
|
||||
|
||||
ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numReceivers", "10");
@@ -55,8 +55,8 @@ namespace AdapterTest
var directKafkaReceiver = KafkaUtils.CreateDirectStream(
|
||||
ssc,
|
||||
new List<string> { "testTopic3" },
|
||||
new Dictionary<string, string>(), new Dictionary<string, long>(),
|
||||
(int pid, IEnumerable<KeyValuePair<byte[], byte[]>> input) => { return input; });
|
||||
new List<Tuple<string, string>>(), new List<Tuple<string, long>>(),
|
||||
(int pid, IEnumerable<Tuple<byte[], byte[]>> input) => { return input; });
|
||||
Assert.IsNotNull(directKafkaReceiver.DStreamProxy);
|
||||
|
||||
var union = ssc.Union(textFile, socketStream);
@@ -99,10 +99,10 @@ namespace AdapterTest
byte[] untilOffset = BitConverter.GetBytes(3L);
|
||||
Array.Reverse(untilOffset);
|
||||
|
||||
var offsetRange = KafkaUtils.GetOffsetRange(new List<KeyValuePair<byte[], byte[]>>
|
||||
var offsetRange = KafkaUtils.GetOffsetRange(new List<Tuple<byte[], byte[]>>
|
||||
{
|
||||
new KeyValuePair<byte[], byte[]>(Encoding.UTF8.GetBytes("testTopic,testClusterId"), partition),
|
||||
new KeyValuePair<byte[], byte[]>(fromOffset, untilOffset)
|
||||
new Tuple<byte[], byte[]>(Encoding.UTF8.GetBytes("testTopic,testClusterId"), partition),
|
||||
new Tuple<byte[], byte[]>(fromOffset, untilOffset)
|
||||
});
|
||||
|
||||
Assert.AreEqual(offsetRange.Topic, "testTopic");
@@ -175,7 +175,7 @@ namespace AdapterTest
// Act
|
||||
var lines = _streamingContext.TextFileStream(Path.GetTempPath());
|
||||
var words = lines.FlatMap(l => l.Split(' '));
|
||||
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
|
||||
var pairs = words.Map(w => new Tuple<string, int>(w, 1));
|
||||
var wordCounts = pairs.ReduceByKey((x, y) => x + y);
|
||||
|
||||
// Assert
@@ -186,8 +186,8 @@ namespace AdapterTest
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
|
||||
Tuple<string, int> countByWord = (Tuple<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "dog" || countByWord.Item1 == "lazy" ? 23 : 22);
|
||||
}
|
||||
});
|
||||
// Use Verify to check whether a method set up on the mock was invoked
@@ -0,0 +1,57 @@
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System;
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
using Microsoft.Spark.CSharp.Sql;
|
||||
using Moq;
|
||||
using NUnit.Framework;
|
||||
|
||||
namespace AdapterTest
|
||||
{
|
||||
[TestFixture]
|
||||
public class UdfRegistrationTest
|
||||
{
|
||||
[Test]
|
||||
public void TestRegisterFunction()
|
||||
{
|
||||
Mock<IUdfRegistrationProxy> mockUdfRegistrationProxy = new Mock<IUdfRegistrationProxy>();
|
||||
mockUdfRegistrationProxy.Setup(m => m.RegisterFunction(It.IsAny<string>(), It.IsAny<byte[]>(), It.IsAny<string>()));
|
||||
|
||||
var udfRegistration = new UdfRegistration(mockUdfRegistrationProxy.Object);
|
||||
|
||||
udfRegistration.RegisterFunction("Func0", () => "Func0");
|
||||
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func0", It.IsAny<byte[]>(), "string"));
|
||||
|
||||
udfRegistration.RegisterFunction<string, string>("Func1", s => "Func1");
|
||||
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func1", It.IsAny<byte[]>(), "string"));
|
||||
|
||||
udfRegistration.RegisterFunction<string, string, string>("Func2", (s1, s2) => "Func2");
|
||||
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func2", It.IsAny<byte[]>(), "string"));
|
||||
|
||||
udfRegistration.RegisterFunction<string, string, string, string>("Func3", (s1, s2, s3) => "Func3");
|
||||
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func3", It.IsAny<byte[]>(), "string"));
|
||||
|
||||
udfRegistration.RegisterFunction<string, string, string, string, string>("Func4", (s1, s2, s3, s4) => "Func4");
|
||||
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func4", It.IsAny<byte[]>(), "string"));
|
||||
|
||||
udfRegistration.RegisterFunction<string, string, string, string, string, string>("Func5", (s1, s2, s3, s4, s5) => "Func5");
|
||||
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func5", It.IsAny<byte[]>(), "string"));
|
||||
|
||||
udfRegistration.RegisterFunction<string, string, string, string, string, string, string>("Func6", (s1, s2, s3, s4, s5, s6) => "Func6");
|
||||
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func6", It.IsAny<byte[]>(), "string"));
|
||||
|
||||
udfRegistration.RegisterFunction<string, string, string, string, string, string, string, string>("Func7", (s1, s2, s3, s4, s5, s6, s7) => "Func7");
|
||||
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func7", It.IsAny<byte[]>(), "string"));
|
||||
|
||||
udfRegistration.RegisterFunction<string, string, string, string, string, string, string, string, string>("Func8", (s1, s2, s3, s4, s5, s6, s7, s8) => "Func8");
|
||||
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func8", It.IsAny<byte[]>(), "string"));
|
||||
|
||||
udfRegistration.RegisterFunction<string, string, string, string, string, string, string, string, string, string>("Func9", (s1, s2, s3, s4, s5, s6, s7, s8, s9) => "Func9");
|
||||
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func9", It.IsAny<byte[]>(), "string"));
|
||||
|
||||
udfRegistration.RegisterFunction<string, string, string, string, string, string, string, string, string, string, string>("Func10", (s1, s2, s3, s4, s5, s6, s7, s8, s9, s10) => "Func10");
|
||||
mockUdfRegistrationProxy.Verify(m => m.RegisterFunction("Func10", It.IsAny<byte[]>(), "string"));
|
||||
}
|
||||
}
|
||||
}
@@ -66,11 +66,11 @@ namespace Microsoft.Spark.CSharp.PerfBenchmark
|
||||
var flaggedRows = parsedRows.Filter(s => s.Item1); //select good rows
|
||||
var selectedDeletions = flaggedRows.Filter(s => s.Item3.Equals(s.Item5)); //select deletions made by same creators
|
||||
var userDeletions = selectedDeletions.Map(s => new KeyValuePair<string, int>(s.Item3, 1));
|
||||
var userDeletions = selectedDeletions.Map(s => new Tuple<string, int>(s.Item3, 1));
|
||||
var userDeletionCount = userDeletions.ReduceByKey((x, y) => x + y);
|
||||
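// Fold with a ("zerovalue", 0) seed reduces the RDD to the single (user, count) pair with the highest deletion count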
var userWithMaxDeletions = userDeletionCount.Fold(new KeyValuePair<string, int>("zerovalue", 0), (kvp1, kvp2) =>
|
||||
var userWithMaxDeletions = userDeletionCount.Fold(new Tuple<string, int>("zerovalue", 0), (kvp1, kvp2) =>
|
||||
{
|
||||
if (kvp1.Value > kvp2.Value)
|
||||
if (kvp1.Item2 > kvp2.Item2)
|
||||
return kvp1;
|
||||
else
|
||||
return kvp2;
@@ -79,7 +79,7 @@ namespace Microsoft.Spark.CSharp.PerfBenchmark
stopwatch.Stop();
|
||||
PerfBenchmark.ExecutionTimeList.Add(stopwatch.Elapsed);
|
||||
|
||||
Console.WriteLine("User with max deletions is {0}, count of deletions={1}. Elapsed time={2}", userWithMaxDeletions.Key, userWithMaxDeletions.Value, stopwatch.Elapsed);
|
||||
Console.WriteLine("User with max deletions is {0}, count of deletions={1}. Elapsed time={2}", userWithMaxDeletions.Item1, userWithMaxDeletions.Item2, stopwatch.Elapsed);
|
||||
}
|
||||
|
||||
[PerfSuite]
@@ -75,14 +75,14 @@ namespace Microsoft.Spark.CSharp
var lines = context.TextFileStream(Path.Combine(directory, "test"));
|
||||
lines = context.Union(lines, lines);
|
||||
var words = lines.FlatMap(l => l.Split(' '));
|
||||
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
|
||||
var pairs = words.Map(w => new Tuple<string, int>(w, 1));
|
||||
|
||||
// since operations like ReduceByKey, Join and UpdateStateByKey are
|
||||
// separate dstream transformations defined in CSharpDStream.scala
|
||||
// an extra CSharpRDD is introduced in between these operations
|
||||
var wordCounts = pairs.ReduceByKey((x, y) => x + y);
|
||||
var join = wordCounts.Window(2, 2).Join(wordCounts, 2);
|
||||
var initialStateRdd = sc.Parallelize( new[] {new KeyValuePair<string, int>("AAA", 88), new KeyValuePair<string, int>("BBB", 88)});
|
||||
var initialStateRdd = sc.Parallelize( new[] {new Tuple<string, int>("AAA", 88), new Tuple<string, int>("BBB", 88)});
|
||||
var state = join.UpdateStateByKey(new UpdateStateHelper(b).Execute, initialStateRdd);
|
||||
|
||||
state.ForeachRDD((time, rdd) =>
@@ -99,8 +99,8 @@ namespace Microsoft.Spark.CSharp
{
|
||||
Console.WriteLine(record);
|
||||
|
||||
var countByWord = (KeyValuePair<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "lazy" || countByWord.Key == "dog" ? 92 : 88);
|
||||
var countByWord = (Tuple<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Item2, countByWord.Item1 == "The" || countByWord.Item1 == "lazy" || countByWord.Item1 == "dog" ? 92 : 88);
|
||||
}
|
||||
Console.WriteLine();
@@ -145,13 +145,13 @@ namespace Microsoft.Spark.CSharp
StreamingContext context = new StreamingContext(sc, 2000L);
|
||||
context.Checkpoint(checkpointPath);
|
||||
|
||||
var kafkaParams = new Dictionary<string, string> {
|
||||
{"metadata.broker.list", brokers},
|
||||
{"auto.offset.reset", "smallest"}
|
||||
var kafkaParams = new List<Tuple<string, string>> {
|
||||
new Tuple<string, string>("metadata.broker.list", brokers),
|
||||
new Tuple<string, string>("auto.offset.reset", "smallest")
|
||||
};
|
||||
|
||||
conf.Set("spark.mobius.streaming.kafka.numPartitions." + topic, partitions.ToString());
|
||||
var dstream = KafkaUtils.CreateDirectStream(context, new List<string> { topic }, kafkaParams, new Dictionary<string, long>());
|
||||
var dstream = KafkaUtils.CreateDirectStream(context, new List<string> { topic }, kafkaParams, Enumerable.Empty<Tuple<string, long>>());
|
||||
|
||||
dstream.ForeachRDD((time, rdd) =>
|
||||
{
@@ -256,7 +256,7 @@ namespace Microsoft.Spark.CSharp
// create the RDD
|
||||
var seedRDD = sc.Parallelize(Enumerable.Range(0, 100), numPartitions);
|
||||
var numbers = new ConstantInputDStream<int>(seedRDD, ssc);
|
||||
var pairs = numbers.Map(n => new KeyValuePair<int, int>(n % numPartitions, n));
|
||||
var pairs = numbers.Map(n => new Tuple<int, int>(n % numPartitions, n));
|
||||
var reduced = pairs.ReduceByKeyAndWindow(
|
||||
(int x, int y) => (x + y),
|
||||
(int x, int y) => (x - y),
@@ -283,10 +283,10 @@ namespace Microsoft.Spark.CSharp
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<int, int> sum = (KeyValuePair<int, int>)record;
|
||||
Console.WriteLine("Key: {0}, Value: {1}", sum.Key, sum.Value);
|
||||
Tuple<int, int> sum = (Tuple<int, int>)record;
|
||||
Console.WriteLine("Key: {0}, Value: {1}", sum.Item1, sum.Item2);
|
||||
// when the batch count reaches the window size, the sums of the even/odd numbers stay at windowDuration / slideDuration * (2450, 2500) respectively
|
||||
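// assuming numPartitions is 2 here: for Enumerable.Range(0, 100) the even values 0 + 2 + ... + 98 sum to 2450 and the odd values 1 + 3 + ... + 99 sum to 2500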
Assert.AreEqual(sum.Value, (count > windowDuration / slideDuration ? windowDuration : count * slideDuration) / (bacthIntervalMs / 1000) * (sum.Key == 0 ? 2450 : 2500));
|
||||
Assert.AreEqual(sum.Item2, (count > windowDuration / slideDuration ? windowDuration : count * slideDuration) / (bacthIntervalMs / 1000) * (sum.Item1 == 0 ? 2450 : 2500));
|
||||
}
|
||||
});
|
||||
@@ -60,16 +60,16 @@ namespace Microsoft.Spark.CSharp.Samples
var lines = context.TextFileStream(Path.Combine(directory, "test1"));
|
||||
lines = context.Union(lines, lines);
|
||||
var words = lines.FlatMap(l => l.Split(' '));
|
||||
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
|
||||
var pairs = words.Map(w => new Tuple<string, int>(w, 1));
|
||||
|
||||
var wordCounts = pairs.ReduceByKey((x, y) => x + y);
|
||||
var initialState = sc.Parallelize(new[] { new KeyValuePair<string, int>("NOT_A_WORD", 1024), new KeyValuePair<string, int>("dog", 10000), }, 1);
|
||||
var stateSpec = new StateSpec<string, int, int, KeyValuePair<string, int>>((word, count, state) =>
|
||||
var initialState = sc.Parallelize(new[] { new Tuple<string, int>("NOT_A_WORD", 1024), new Tuple<string, int>("dog", 10000), }, 1);
|
||||
var stateSpec = new StateSpec<string, int, int, Tuple<string, int>>((word, count, state) =>
|
||||
{
|
||||
if (state.IsTimingOut())
|
||||
{
|
||||
Console.WriteLine("Found timing out word: {0}", word);
|
||||
return new KeyValuePair<string, int>(word, state.Get());
|
||||
return new Tuple<string, int>(word, state.Get());
|
||||
}
|
||||
|
||||
var sum = 0;
|
||||
|
@ -79,7 +79,7 @@ namespace Microsoft.Spark.CSharp.Samples
|
|||
}
|
||||
state.Update(sum + count);
|
||||
Console.WriteLine("word: {0}, count: {1}", word, sum + count);
|
||||
return new KeyValuePair<string, int>(word, sum + count);
|
||||
return new Tuple<string, int>(word, sum + count);
|
||||
}).NumPartitions(1).InitialState(initialState).Timeout(TimeSpan.FromSeconds(30));
|
||||
|
||||
var snapshots = wordCounts.MapWithState(stateSpec).StateSnapshots();
|
||||
|
@ -89,9 +89,9 @@ namespace Microsoft.Spark.CSharp.Samples
|
|||
Console.WriteLine("Snapshots @ Time: {0}", time);
|
||||
Console.WriteLine("-------------------------------------------");
|
||||
|
||||
foreach (KeyValuePair<string, int> record in rdd.Collect())
|
||||
foreach (Tuple<string, int> record in rdd.Collect())
|
||||
{
|
||||
Console.WriteLine("[{0}, {1}]", record.Key, record.Value);
|
||||
Console.WriteLine("[{0}, {1}]", record.Item1, record.Item2);
|
||||
}
|
||||
Console.WriteLine();
|
||||
});
|
||||
|
|
|
@ -15,7 +15,7 @@ namespace Microsoft.Spark.CSharp.Samples
[Sample]
internal static void PairRDDCollectAsMapSample()
{
var map = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).CollectAsMap();
var map = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1).CollectAsMap();

foreach (var kv in map)
Console.WriteLine(kv);

@ -30,7 +30,7 @@ namespace Microsoft.Spark.CSharp.Samples
[Sample]
internal static void PairRDDKeysSample()
{
var keys = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).Keys().Collect();
var keys = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1).Keys().Collect();

Console.WriteLine(keys[0]);
Console.WriteLine(keys[1]);

@ -45,7 +45,7 @@ namespace Microsoft.Spark.CSharp.Samples
[Sample]
internal static void PairRDDValuesSample()
{
var values = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).Values().Collect();
var values = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<int, int>(1, 2), new Tuple<int, int>(3, 4) }, 1).Values().Collect();

Console.WriteLine(values[0]);
Console.WriteLine(values[1]);

@ -63,9 +63,9 @@ namespace Microsoft.Spark.CSharp.Samples
var reduced = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.ReduceByKey((x, y) => x + y).Collect();

@ -74,8 +74,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(reduced.Contains(new KeyValuePair<string, int>("a", 2)));
Assert.IsTrue(reduced.Contains(new KeyValuePair<string, int>("b", 1)));
Assert.IsTrue(reduced.Contains(new Tuple<string, int>("a", 2)));
Assert.IsTrue(reduced.Contains(new Tuple<string, int>("b", 1)));
}
}

@ -85,9 +85,9 @@ namespace Microsoft.Spark.CSharp.Samples
var reduced = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.ReduceByKeyLocally((x, y) => x + y);

@ -107,11 +107,12 @@ namespace Microsoft.Spark.CSharp.Samples
var countByKey = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.CountByKey();
.CountByKey()
.ToDictionary(k => k.Item1, v => v.Item2);

foreach (var kv in countByKey)
Console.WriteLine(kv);

@ -129,15 +130,15 @@ namespace Microsoft.Spark.CSharp.Samples
var l = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 4),
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 4),
}, 1);

var r = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 2),
new KeyValuePair<string, int>("a", 3),
new Tuple<string, int>("a", 2),
new Tuple<string, int>("a", 3),
}, 1);

var joined = l.Join(r, 2).Collect();

@ -147,8 +148,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(joined.Contains(new KeyValuePair<string, Tuple<int, int>>("a", new Tuple<int, int>(1, 2))));
Assert.IsTrue(joined.Contains(new KeyValuePair<string, Tuple<int, int>>("a", new Tuple<int, int>(1, 3))));
Assert.IsTrue(joined.Contains(new Tuple<string, Tuple<int, int>>("a", new Tuple<int, int>(1, 2))));
Assert.IsTrue(joined.Contains(new Tuple<string, Tuple<int, int>>("a", new Tuple<int, int>(1, 3))));
}
}

@ -158,14 +159,14 @@ namespace Microsoft.Spark.CSharp.Samples
var l = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 4),
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 4),
}, 2);

var r = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 2),
new Tuple<string, int>("a", 2),
}, 1);

var joined = l.LeftOuterJoin(r).Collect();

@ -175,8 +176,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(joined.Any(kv => kv.Key == "a" && kv.Value.Item1 == 1 && kv.Value.Item2.IsDefined && kv.Value.Item2.GetValue() == 2));
Assert.IsTrue(joined.Any(kv => kv.Key == "b" && kv.Value.Item1 == 4 && !kv.Value.Item2.IsDefined));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "a" && kv.Item2.Item1 == 1 && kv.Item2.Item2.IsDefined && kv.Item2.Item2.GetValue() == 2));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "b" && kv.Item2.Item1 == 4 && !kv.Item2.Item2.IsDefined));
}
}

@ -186,14 +187,14 @@ namespace Microsoft.Spark.CSharp.Samples
var l = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 2),
new Tuple<string, int>("a", 2),
}, 1);

var r = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 4),
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 4),
}, 2);

var joined = l.RightOuterJoin(r).Collect();

@ -203,8 +204,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(joined.Any(kv => kv.Key == "a" && kv.Value.Item1.IsDefined && kv.Value.Item1.GetValue() == 2 && kv.Value.Item2 == 1));
Assert.IsTrue(joined.Any(kv => kv.Key == "b" && !kv.Value.Item1.IsDefined && kv.Value.Item2 == 4));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "a" && kv.Item2.Item1.IsDefined && kv.Item2.Item1.GetValue() == 2 && kv.Item2.Item2 == 1));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "b" && !kv.Item2.Item1.IsDefined && kv.Item2.Item2 == 4));
}
}
@ -214,15 +215,15 @@ namespace Microsoft.Spark.CSharp.Samples
var l = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 4),
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 4),
}, 2);

var r = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 2),
new KeyValuePair<string, int>("c", 8),
new Tuple<string, int>("a", 2),
new Tuple<string, int>("c", 8),
}, 2);

var joined = l.FullOuterJoin(r).Collect();

@ -232,12 +233,12 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(joined.Any(kv => kv.Key == "a" && kv.Value.Item1.IsDefined && kv.Value.Item1.GetValue() == 1 &&
kv.Value.Item2.IsDefined && kv.Value.Item2.GetValue() == 2));
Assert.IsTrue(joined.Any(kv => kv.Key == "b" && kv.Value.Item1.IsDefined && kv.Value.Item1.GetValue() == 4 &&
!kv.Value.Item2.IsDefined));
Assert.IsTrue(joined.Any(kv => kv.Key == "c" && !kv.Value.Item1.IsDefined &&
kv.Value.Item2.IsDefined && kv.Value.Item2.GetValue() == 8));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "a" && kv.Item2.Item1.IsDefined && kv.Item2.Item1.GetValue() == 1 &&
kv.Item2.Item2.IsDefined && kv.Item2.Item2.GetValue() == 2));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "b" && kv.Item2.Item1.IsDefined && kv.Item2.Item1.GetValue() == 4 &&
!kv.Item2.Item2.IsDefined));
Assert.IsTrue(joined.Any(kv => kv.Item1 == "c" && !kv.Item2.Item1.IsDefined &&
kv.Item2.Item2.IsDefined && kv.Item2.Item2.GetValue() == 8));
}
}

@ -252,7 +253,7 @@ namespace Microsoft.Spark.CSharp.Samples
};

var partitioned = SparkCLRSamples.SparkContext.Parallelize(new[] { 1, 2, 3, 4, 5, 6, 1 }, 3)
.Map(x => new KeyValuePair<int, int>(x, x + 100))
.Map(x => new Tuple<int, int>(x, x + 100))
.PartitionBy(3, partitionFunc)
.Glom()
.Collect();

@ -270,9 +271,9 @@ namespace Microsoft.Spark.CSharp.Samples
{
Assert.AreEqual(3, partitioned.Length);
// Assert that the partition distribution is correct with partitionFunc
Assert.IsTrue(partitioned.Count(p => p.All(key => key.Key < 3)) == 1);
Assert.IsTrue(partitioned.Count(p => p.All(key => key.Key >= 3 && key.Key < 6)) == 1);
Assert.IsTrue(partitioned.Count(p => p.All(key => key.Key >= 6)) == 1);
Assert.IsTrue(partitioned.Count(p => p.All(key => key.Item1 < 3)) == 1);
Assert.IsTrue(partitioned.Count(p => p.All(key => key.Item1 >= 3 && key.Item1 < 6)) == 1);
Assert.IsTrue(partitioned.Count(p => p.All(key => key.Item1 >= 6)) == 1);
}
}

@ -282,9 +283,9 @@ namespace Microsoft.Spark.CSharp.Samples
var combineByKey = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.CombineByKey(() => string.Empty, (x, y) => x + y.ToString(CultureInfo.InvariantCulture), (x, y) => x + y).Collect();

@ -293,8 +294,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(combineByKey.Contains(new KeyValuePair<string, string>("a", "11")));
Assert.IsTrue(combineByKey.Contains(new KeyValuePair<string, string>("b", "1")));
Assert.IsTrue(combineByKey.Contains(new Tuple<string, string>("a", "11")));
Assert.IsTrue(combineByKey.Contains(new Tuple<string, string>("b", "1")));
}
}

@ -304,9 +305,9 @@ namespace Microsoft.Spark.CSharp.Samples
var aggregateByKey = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.AggregateByKey(() => 0, (x, y) => x + y, (x, y) => x + y).Collect();

@ -315,8 +316,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(aggregateByKey.Contains(new KeyValuePair<string, int>("a", 2)));
Assert.IsTrue(aggregateByKey.Contains(new KeyValuePair<string, int>("b", 1)));
Assert.IsTrue(aggregateByKey.Contains(new Tuple<string, int>("a", 2)));
Assert.IsTrue(aggregateByKey.Contains(new Tuple<string, int>("b", 1)));
}
}

@ -326,9 +327,9 @@ namespace Microsoft.Spark.CSharp.Samples
var FoldByKey = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.FoldByKey(() => 0, (x, y) => x + y).Collect();

@ -337,8 +338,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(FoldByKey.Contains(new KeyValuePair<string, int>("a", 2)));
Assert.IsTrue(FoldByKey.Contains(new KeyValuePair<string, int>("b", 1)));
Assert.IsTrue(FoldByKey.Contains(new Tuple<string, int>("a", 2)));
Assert.IsTrue(FoldByKey.Contains(new Tuple<string, int>("b", 1)));
}
}

@ -348,19 +349,19 @@ namespace Microsoft.Spark.CSharp.Samples
var groupByKey = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, int>("a", 1),
new KeyValuePair<string, int>("b", 1),
new KeyValuePair<string, int>("a", 1)
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 1),
new Tuple<string, int>("a", 1)
}, 2)
.GroupByKey().Collect();

foreach (var kv in groupByKey)
Console.WriteLine(kv.Key + ", " + "(" + string.Join(",", kv.Value) + ")");
Console.WriteLine(kv.Item1 + ", " + "(" + string.Join(",", kv.Item2) + ")");

if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(groupByKey.Any(kv => kv.Key == "a" && kv.Value.Count == 2 && kv.Value[0] == 1 && kv.Value[1] == 1));
Assert.IsTrue(groupByKey.Any(kv => kv.Key == "b" && kv.Value.Count == 1 && kv.Value[0] == 1));
Assert.IsTrue(groupByKey.Any(kv => kv.Item1 == "a" && kv.Item2.Count == 2 && kv.Item2[0] == 1 && kv.Item2[1] == 1));
Assert.IsTrue(groupByKey.Any(kv => kv.Item1 == "b" && kv.Item2.Count == 1 && kv.Item2[0] == 1));
}
}
@ -370,8 +371,8 @@ namespace Microsoft.Spark.CSharp.Samples
var mapValues = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, string[]>("a", new[]{"apple", "banana", "lemon"}),
new KeyValuePair<string, string[]>("b", new[]{"grapes"})
new Tuple<string, string[]>("a", new[]{"apple", "banana", "lemon"}),
new Tuple<string, string[]>("b", new[]{"grapes"})
}, 2)
.MapValues(x => x.Length).Collect();

@ -380,8 +381,8 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(mapValues.Any(kv => kv.Key == "a" && kv.Value == 3));
Assert.IsTrue(mapValues.Any(kv => kv.Key == "b" && kv.Value == 1));
Assert.IsTrue(mapValues.Any(kv => kv.Item1 == "a" && kv.Item2 == 3));
Assert.IsTrue(mapValues.Any(kv => kv.Item1 == "b" && kv.Item2 == 1));
}
}

@ -391,8 +392,8 @@ namespace Microsoft.Spark.CSharp.Samples
var flatMapValues = SparkCLRSamples.SparkContext.Parallelize(
new[]
{
new KeyValuePair<string, string[]>("a", new[]{"x", "y", "z"}),
new KeyValuePair<string, string[]>("b", new[]{"p", "r"})
new Tuple<string, string[]>("a", new[]{"x", "y", "z"}),
new Tuple<string, string[]>("b", new[]{"p", "r"})
}, 2)
.FlatMapValues(x => x).Collect();

@ -401,48 +402,48 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(flatMapValues.Any(kv => kv.Key == "a" && kv.Value == "x"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Key == "a" && kv.Value == "y"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Key == "a" && kv.Value == "z"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Key == "b" && kv.Value == "p"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Key == "b" && kv.Value == "r"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Item1 == "a" && kv.Item2 == "x"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Item1 == "a" && kv.Item2 == "y"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Item1 == "a" && kv.Item2 == "z"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Item1 == "b" && kv.Item2 == "p"));
Assert.IsTrue(flatMapValues.Any(kv => kv.Item1 == "b" && kv.Item2 == "r"));
}
}

[Sample]
internal static void PairRDDGroupWithSample()
{
var x = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4)}, 2);
var y = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("a", 2)}, 1);
var x = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4)}, 2);
var y = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int>("a", 2)}, 1);

var groupWith = x.GroupWith(y).Collect();

foreach (var kv in groupWith)
Console.WriteLine(kv.Key + ", " + "(" + string.Join(",", kv.Value) + ")");
Console.WriteLine(kv.Item1 + ", " + "(" + string.Join(",", kv.Item2) + ")");

if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(groupWith.Any(kv => kv.Key == "a" && kv.Value.Item1[0] == 1 && kv.Value.Item2[0] == 2));
Assert.IsTrue(groupWith.Any(kv => kv.Key == "b" && kv.Value.Item1[0] == 4 && !kv.Value.Item2.Any()));
Assert.IsTrue(groupWith.Any(kv => kv.Item1 == "a" && kv.Item2.Item1[0] == 1 && kv.Item2.Item2[0] == 2));
Assert.IsTrue(groupWith.Any(kv => kv.Item1 == "b" && kv.Item2.Item1[0] == 4 && !kv.Item2.Item2.Any()));
}
}

[Sample]
internal static void PairRDDGroupWithSample2()
{
var x = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("a", 5), new KeyValuePair<string, int>("b", 6) }, 2);
var y = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 2);
var z = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("a", 2) }, 1);
var x = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int>("a", 5), new Tuple<string, int>("b", 6) }, 2);
var y = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int>("a", 1), new Tuple<string, int>("b", 4) }, 2);
var z = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int>("a", 2) }, 1);

var groupWith = x.GroupWith(y, z).Collect();

foreach (var kv in groupWith)
Console.WriteLine(kv.Key + ", " + "(" + string.Join(",", kv.Value) + ")");
Console.WriteLine(kv.Item1 + ", " + "(" + string.Join(",", kv.Item2) + ")");

if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(groupWith.Any(kv => kv.Key == "a" && kv.Value.Item1[0] == 5 && kv.Value.Item2[0] == 1 && kv.Value.Item3[0] == 2));
Assert.IsTrue(groupWith.Any(kv => kv.Key == "b" && kv.Value.Item1[0] == 6 && kv.Value.Item2[0] == 4 && !kv.Value.Item3.Any()));
Assert.IsTrue(groupWith.Any(kv => kv.Item1 == "a" && kv.Item2.Item1[0] == 5 && kv.Item2.Item2[0] == 1 && kv.Item2.Item3[0] == 2));
Assert.IsTrue(groupWith.Any(kv => kv.Item1 == "b" && kv.Item2.Item1[0] == 6 && kv.Item2.Item2[0] == 4 && !kv.Item2.Item3.Any()));
}
}

@ -452,7 +453,7 @@ namespace Microsoft.Spark.CSharp.Samples
//{
//    var fractions = new Dictionary<string, double> { { "a", 0.2 }, { "b", 0.1 } };
//    var rdd = SparkCLRSamples.SparkContext.Parallelize(fractions.Keys.ToArray(), 2).Cartesian(SparkCLRSamples.SparkContext.Parallelize(Enumerable.Range(0, 1000), 2));
//    var sample = rdd.Map(t => new KeyValuePair<string, int>(t.Item1, t.Item2)).SampleByKey(false, fractions, 2).GroupByKey().Collect();
//    var sample = rdd.Map(t => new Tuple<string, int>(t.Item1, t.Item2)).SampleByKey(false, fractions, 2).GroupByKey().Collect();

//    Console.WriteLine(sample);
//}

@ -460,8 +461,8 @@ namespace Microsoft.Spark.CSharp.Samples
[Sample]
internal static void PairRDDSubtractByKeySample()
{
var x = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int?>("a", 1), new KeyValuePair<string, int?>("b", 4), new KeyValuePair<string, int?>("b", 5), new KeyValuePair<string, int?>("a", 2) }, 2);
var y = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int?>("a", 3), new KeyValuePair<string, int?>("c", null) }, 2);
var x = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int?>("a", 1), new Tuple<string, int?>("b", 4), new Tuple<string, int?>("b", 5), new Tuple<string, int?>("a", 2) }, 2);
var y = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int?>("a", 3), new Tuple<string, int?>("c", null) }, 2);

var subtractByKey = x.SubtractByKey(y).Collect();

@ -471,15 +472,15 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.AreEqual(2, subtractByKey.Length);
subtractByKey.Contains(new KeyValuePair<string, int?>("b", 4));
subtractByKey.Contains(new KeyValuePair<string, int?>("b", 5));
subtractByKey.Contains(new Tuple<string, int?>("b", 4));
subtractByKey.Contains(new Tuple<string, int?>("b", 5));
}
}

[Sample]
internal static void PairRDDLookupSample()
{
var rdd = SparkCLRSamples.SparkContext.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new KeyValuePair<int, int>(x, y)), 10);
var rdd = SparkCLRSamples.SparkContext.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new Tuple<int, int>(x, y)), 10);
var lookup42 = rdd.Lookup(42);
var lookup1024 = rdd.Lookup(1024);
Console.WriteLine(string.Join(",", lookup42));

@ -495,9 +496,9 @@ namespace Microsoft.Spark.CSharp.Samples
[Sample]
internal static void PairRDDSortByKeySample()
{
var rdd = SparkCLRSamples.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("B", 2),
new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("c", 3),
new KeyValuePair<string, int>("E", 5), new KeyValuePair<string, int>("D", 4)}, 3);
var rdd = SparkCLRSamples.SparkContext.Parallelize(new[] { new Tuple<string, int>("B", 2),
new Tuple<string, int>("a", 1), new Tuple<string, int>("c", 3),
new Tuple<string, int>("E", 5), new Tuple<string, int>("D", 4)}, 3);

var sortedRdd = rdd.SortByKey(true, 2);
var sortedInTotal = sortedRdd.Collect();

@ -507,7 +508,7 @@ namespace Microsoft.Spark.CSharp.Samples
{
Assert.AreEqual(2, sortedPartitions.Length);
// by default SortByKey is case sensitive
CollectionAssert.AreEqual(new[] { "B", "D", "E", "a", "c" }, sortedInTotal.Select(kv => kv.Key).ToArray());
CollectionAssert.AreEqual(new[] { "B", "D", "E", "a", "c" }, sortedInTotal.Select(kv => kv.Item1).ToArray());
}

// convert the keys to lower case in order to sort with case insensitive

@ -518,7 +519,7 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.AreEqual(2, sortedPartitions.Length);
CollectionAssert.AreEqual(new[] { "a", "B", "c", "D", "E" }, sortedInTotal.Select(kv => kv.Key).ToArray());
CollectionAssert.AreEqual(new[] { "a", "B", "c", "D", "E" }, sortedInTotal.Select(kv => kv.Item1).ToArray());
}
}
}
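All of the pair-RDD samples above follow the same mechanical migration: `KeyValuePair<K, V>` becomes `Tuple<K, V>`, and the `.Key`/`.Value` accessors become `.Item1`/`.Item2`. A hedged sketch of the pattern, using placeholder data and a placeholder `sparkContext` rather than the samples' own setup:

```c#
// Before: new KeyValuePair<string, int>("a", 1) accessed via kv.Key / kv.Value
// After:  new Tuple<string, int>("a", 1) accessed via kv.Item1 / kv.Item2
var pairs = sparkContext.Parallelize(new[]
{
    new Tuple<string, int>("a", 1),
    new Tuple<string, int>("b", 1),
    new Tuple<string, int>("a", 1)
}, 2);

foreach (var kv in pairs.ReduceByKey((x, y) => x + y).Collect())
{
    Console.WriteLine("{0}: {1}", kv.Item1, kv.Item2);   // expected output: a: 2 and b: 1
}
```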
@ -3,6 +3,7 @@

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using Microsoft.Spark.CSharp.Core;

@ -129,7 +130,7 @@ namespace Microsoft.Spark.CSharp.Samples
var rdd = SparkCLRSamples.SparkContext.Parallelize(new int[] { 1, 1, 2, 3, 5, 8 }, 1);
var groups = rdd.GroupBy(x => x % 2).Collect();
foreach (var kv in groups)
Console.WriteLine(kv.Key + ", " + string.Join(",", kv.Value));
Console.WriteLine(kv.Item1 + ", " + string.Join(",", kv.Item2));

if (SparkCLRSamples.Configuration.IsValidationEnabled)
{

@ -137,9 +138,9 @@ namespace Microsoft.Spark.CSharp.Samples
foreach (var kv in groups)
{
// the group with key=1 is odd numbers
if (kv.Key == 1) CollectionAssert.AreEquivalent(new[] { 1, 1, 3, 5 }, kv.Value);
if (kv.Item1 == 1) CollectionAssert.AreEquivalent(new[] { 1, 1, 3, 5 }, kv.Item2);
// the group with key=0 is even numbers
else if (kv.Key == 0) CollectionAssert.AreEquivalent(new[] { 2, 8 }, kv.Value);
else if (kv.Item1 == 0) CollectionAssert.AreEquivalent(new[] { 2, 8 }, kv.Item2);
}
}
}

@ -221,7 +222,10 @@ namespace Microsoft.Spark.CSharp.Samples
[Sample]
internal static void RDDCountByValueSample()
{
var countByValue = SparkCLRSamples.SparkContext.Parallelize(new int[] { 1, 2, 1, 2, 2 }, 2).CountByValue();
var countByValue = SparkCLRSamples.SparkContext.Parallelize(new int[] { 1, 2, 1, 2, 2 }, 2)
.CountByValue()
.ToDictionary(k => k.Item1, v => v.Item2);

foreach (var item in countByValue)
Console.WriteLine(item);

@ -292,10 +296,10 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(keyBy.Contains(new KeyValuePair<int, int>(1, 1)));
Assert.IsTrue(keyBy.Contains(new KeyValuePair<int, int>(4, 2)));
Assert.IsTrue(keyBy.Contains(new KeyValuePair<int, int>(9, 3)));
Assert.IsTrue(keyBy.Contains(new KeyValuePair<int, int>(16, 4)));
Assert.IsTrue(keyBy.Contains(new Tuple<int, int>(1, 1)));
Assert.IsTrue(keyBy.Contains(new Tuple<int, int>(4, 2)));
Assert.IsTrue(keyBy.Contains(new Tuple<int, int>(9, 3)));
Assert.IsTrue(keyBy.Contains(new Tuple<int, int>(16, 4)));
}
}

@ -344,7 +348,7 @@ namespace Microsoft.Spark.CSharp.Samples
{
for (int i = 0; i < 5; i++)
{
Assert.IsTrue(zip.Contains(new KeyValuePair<int, int>(i, 1000 + i)));
Assert.IsTrue(zip.Contains(new Tuple<int, int>(i, 1000 + i)));
}
}
}

@ -358,10 +362,10 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(zipWithIndex.Contains(new KeyValuePair<string, long>("a", 0)));
Assert.IsTrue(zipWithIndex.Contains(new KeyValuePair<string, long>("b", 1)));
Assert.IsTrue(zipWithIndex.Contains(new KeyValuePair<string, long>("c", 2)));
Assert.IsTrue(zipWithIndex.Contains(new KeyValuePair<string, long>("d", 3)));
Assert.IsTrue(zipWithIndex.Contains(new Tuple<string, long>("a", 0)));
Assert.IsTrue(zipWithIndex.Contains(new Tuple<string, long>("b", 1)));
Assert.IsTrue(zipWithIndex.Contains(new Tuple<string, long>("c", 2)));
Assert.IsTrue(zipWithIndex.Contains(new Tuple<string, long>("d", 3)));
}
}

@ -374,11 +378,11 @@ namespace Microsoft.Spark.CSharp.Samples
if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
Assert.IsTrue(zipWithUniqueId.Contains(new KeyValuePair<string, long>("a", 0)));
Assert.IsTrue(zipWithUniqueId.Contains(new KeyValuePair<string, long>("b", 1)));
Assert.IsTrue(zipWithUniqueId.Contains(new KeyValuePair<string, long>("c", 4)));
Assert.IsTrue(zipWithUniqueId.Contains(new KeyValuePair<string, long>("d", 2)));
Assert.IsTrue(zipWithUniqueId.Contains(new KeyValuePair<string, long>("e", 5)));
Assert.IsTrue(zipWithUniqueId.Contains(new Tuple<string, long>("a", 0)));
Assert.IsTrue(zipWithUniqueId.Contains(new Tuple<string, long>("b", 1)));
Assert.IsTrue(zipWithUniqueId.Contains(new Tuple<string, long>("c", 4)));
Assert.IsTrue(zipWithUniqueId.Contains(new Tuple<string, long>("d", 2)));
Assert.IsTrue(zipWithUniqueId.Contains(new Tuple<string, long>("e", 5)));
}
}
@ -530,22 +534,22 @@ namespace Microsoft.Spark.CSharp.Samples
var words = lines.FlatMap(s => s.Split(' '));

var wordCounts = words.Map(w => new KeyValuePair<string, int>(w.Trim(), 1))
var wordCounts = words.Map(w => new Tuple<string, int>(w.Trim(), 1))
.ReduceByKey((x, y) => x + y).Collect();

Console.WriteLine("*** Printing words and their counts ***");
foreach (var kvp in wordCounts)
{
Console.WriteLine("'{0}':{1}", kvp.Key, kvp.Value);
Console.WriteLine("'{0}':{1}", kvp.Item1, kvp.Item2);
}

var wordCountsCaseInsensitve = words.Map(w => new KeyValuePair<string, int>(w.ToLower().Trim(), 1))
var wordCountsCaseInsensitve = words.Map(w => new Tuple<string, int>(w.ToLower().Trim(), 1))
.ReduceByKey((x, y) => x + y).Collect();

Console.WriteLine("*** Printing words and their counts ignoring case ***");
foreach (var kvp in wordCountsCaseInsensitve)
{
Console.WriteLine("'{0}':{1}", kvp.Key, kvp.Value);
Console.WriteLine("'{0}':{1}", kvp.Item1, kvp.Item2);
}

if (SparkCLRSamples.Configuration.IsValidationEnabled)

@ -553,7 +557,7 @@ namespace Microsoft.Spark.CSharp.Samples
var dictionary = new Dictionary<string, int>();
foreach (var kvp in wordCounts)
{
dictionary[kvp.Key] = kvp.Value;
dictionary[kvp.Item1] = kvp.Item2;
}

Assert.AreEqual(22, dictionary["the"]);

@ -563,7 +567,7 @@ namespace Microsoft.Spark.CSharp.Samples
var caseInsenstiveWordCountDictionary = new Dictionary<string, int>();
foreach (var kvp in wordCountsCaseInsensitve)
{
caseInsenstiveWordCountDictionary[kvp.Key] = kvp.Value;
caseInsenstiveWordCountDictionary[kvp.Item1] = kvp.Item2;
}

Assert.AreEqual(45, caseInsenstiveWordCountDictionary["the"]);

@ -584,12 +588,12 @@ namespace Microsoft.Spark.CSharp.Samples
var requestsColumns = requests.Map(s =>
{
var columns = s.Split(',');
return new KeyValuePair<string, string[]>(columns[0], new[] { columns[1], columns[2], columns[3] });
return new Tuple<string, string[]>(columns[0], new[] { columns[1], columns[2], columns[3] });
});
var metricsColumns = metrics.Map(s =>
{
var columns = s.Split(',');
return new KeyValuePair<string, string[]>(columns[3], new[] { columns[4], columns[5], columns[6] });
return new Tuple<string, string[]>(columns[3], new[] { columns[4], columns[5], columns[6] });
});

var requestsJoinedWithMetrics = requestsColumns.Join(metricsColumns)

@ -597,29 +601,29 @@ namespace Microsoft.Spark.CSharp.Samples
s =>
new []
{
s.Key, //guid
s.Value.Item1[0], s.Value.Item1[1], s.Value.Item1[2], //dc, abtestid, traffictype
s.Value.Item2[0],s.Value.Item2[1], s.Value.Item2[2] //lang, country, metric
s.Item1, //guid
s.Item2.Item1[0], s.Item2.Item1[1], s.Item2.Item1[2], //dc, abtestid, traffictype
s.Item2.Item2[0],s.Item2.Item2[1], s.Item2.Item2[2] //lang, country, metric
});

var latencyByDatacenter = requestsJoinedWithMetrics.Map(i => new KeyValuePair<string, int> (i[1], int.Parse(i[6]))); //key is "datacenter"
var latencyByDatacenter = requestsJoinedWithMetrics.Map(i => new Tuple<string, int> (i[1], int.Parse(i[6]))); //key is "datacenter"
var maxLatencyByDataCenterList = latencyByDatacenter.ReduceByKey(Math.Max).Collect();

Console.WriteLine("***** Max latency metrics by DC *****");
foreach (var keyValuePair in maxLatencyByDataCenterList)
foreach (var Tuple in maxLatencyByDataCenterList)
{
Console.WriteLine("Datacenter={0}, Max latency={1}", keyValuePair.Key, keyValuePair.Value);
Console.WriteLine("Datacenter={0}, Max latency={1}", Tuple.Item1, Tuple.Item2);
}

var latencyAndCountByDatacenter = requestsJoinedWithMetrics.Map(i => new KeyValuePair<string, Tuple<int,int>> (i[1], new Tuple<int, int>(int.Parse(i[6]), 1)));
var latencyAndCountByDatacenter = requestsJoinedWithMetrics.Map(i => new Tuple<string, Tuple<int,int>> (i[1], new Tuple<int, int>(int.Parse(i[6]), 1)));
var sumLatencyAndCountByDatacenter = latencyAndCountByDatacenter.ReduceByKey((tuple, tuple1) => new Tuple<int, int>((tuple == null ? 0 : tuple.Item1) + tuple1.Item1, (tuple == null ? 0 : tuple.Item2) + tuple1.Item2));
var sumLatencyAndCountByDatacenterList = sumLatencyAndCountByDatacenter.Collect();

Console.WriteLine("***** Mean latency metrics by DC *****");
foreach (var keyValuePair in sumLatencyAndCountByDatacenterList)
foreach (var Tuple in sumLatencyAndCountByDatacenterList)
{
Console.WriteLine("Datacenter={0}, Mean latency={1}", keyValuePair.Key, keyValuePair.Value.Item1/keyValuePair.Value.Item2);
Console.WriteLine("Datacenter={0}, Mean latency={1}", Tuple.Item1, Tuple.Item2.Item1/Tuple.Item2.Item2);
}

if (SparkCLRSamples.Configuration.IsValidationEnabled)

@ -627,7 +631,7 @@ namespace Microsoft.Spark.CSharp.Samples
var dictionary = new Dictionary<string, int>();
foreach (var kvp in maxLatencyByDataCenterList)
{
dictionary[kvp.Key] = kvp.Value;
dictionary[kvp.Item1] = kvp.Item2;
}

Assert.AreEqual(835, dictionary["iowa"]);

@ -636,7 +640,7 @@ namespace Microsoft.Spark.CSharp.Samples
var meanDictionary = new Dictionary<string, Tuple<int, int>>();
foreach (var kvp in sumLatencyAndCountByDatacenterList)
{
meanDictionary[kvp.Key] = new Tuple<int, int>(kvp.Value.Item1, kvp.Value.Item2);
meanDictionary[kvp.Item1] = new Tuple<int, int>(kvp.Item2.Item1, kvp.Item2.Item2);
}

Assert.AreEqual(1621, meanDictionary["iowa"].Item1);

@ -737,7 +741,7 @@ namespace Microsoft.Spark.CSharp.Samples
var markets = SparkCLRSamples.SparkContext.TextFile(SparkCLRSamples.Configuration.GetInputDataPath("market.tab"), 1);
long totalMarketsCount = markets.Count();

var marketsByKey = markets.Map(x => new KeyValuePair<string, string>(x.Substring(0, x.IndexOf('-')), x));
var marketsByKey = markets.Map(x => new Tuple<string, string>(x.Substring(0, x.IndexOf('-')), x));
var categories = marketsByKey.PartitionBy(2)
.CombineByKey(() => "", (c, v) => v.Substring(0, v.IndexOf('-')), (c1, c2) => c1, 2);
var categoriesCollectedCount = categories.Collect().Count();
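A related behavioural change visible in the hunks above: `CountByKey` and `CountByValue` now yield a sequence of tuples instead of a dictionary, so callers that need keyed lookup convert explicitly. A small sketch of that conversion, with `sparkContext` as a placeholder:

```c#
// CountByValue now yields (value, count) tuples; convert when dictionary semantics are needed.
var countByValue = sparkContext.Parallelize(new[] { 1, 2, 1, 2, 2 }, 2)
    .CountByValue()
    .ToDictionary(t => t.Item1, t => t.Item2);

Console.WriteLine(countByValue[2]);   // expected: 3
```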
@ -185,5 +185,31 @@ namespace Microsoft.Spark.CSharp.Samples
Assert.AreEqual(schemaPeople.Json, dataFramePeople.Schema.Json);
}
}

[Sample]
internal static void SparkSessionUdfSample()
{
GetSparkSession().Udf.RegisterFunction<string, string, string>("FullAddress", (city, state) => city + " " + state);
GetSparkSession().Udf.RegisterFunction<bool, string, int>("PeopleFilter", (name, age) => name == "Bill" && age > 80);

var peopleDataFrame = GetSparkSession().Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(DataFrameSamples.PeopleJson));
var functionAppliedDF = peopleDataFrame.SelectExpr("name", "age * 2 as age",
"FullAddress(address.city, address.state) as address")
.Where("PeopleFilter(name, age)");

functionAppliedDF.ShowSchema();
functionAppliedDF.Show();

if (SparkCLRSamples.Configuration.IsValidationEnabled)
{
var collected = functionAppliedDF.Collect().ToArray();
CollectionAssert.AreEquivalent(new[] { "name", "age", "address" },
functionAppliedDF.Schema.Fields.Select(f => f.Name).ToArray());
Assert.AreEqual(1, collected.Length);
Assert.AreEqual("Bill", collected[0].Get("name"));
Assert.AreEqual(86, collected[0].Get("age"));
Assert.AreEqual("Seattle Washington", collected[0].Get("address"));
}
}
}
}
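The new `SparkSessionUdfSample` above registers UDFs through the session rather than a SQL context and then references them from SQL expressions. A condensed sketch of that flow, where `session` and the JSON path are placeholders:

```c#
// Register UDFs on the session, then reference them by name in SelectExpr/Where expressions.
session.Udf.RegisterFunction<string, string, string>("FullAddress", (city, state) => city + " " + state);
session.Udf.RegisterFunction<bool, string, int>("PeopleFilter", (name, age) => name == "Bill" && age > 80);

var people = session.Read().Json(@"hdfs://path/to/people.json");
people.SelectExpr("name", "age * 2 as age", "FullAddress(address.city, address.state) as address")
      .Where("PeopleFilter(name, age)")
      .Show();
```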
@ -533,7 +533,7 @@ namespace Microsoft.Spark.CSharp
.GetField("value", BindingFlags.NonPublic | BindingFlags.Instance)
.GetValue(item.Value);
logger.LogDebug("({0}, {1})", item.Key, value);
formatter.Serialize(ms, new KeyValuePair<int, dynamic>(item.Key, value));
formatter.Serialize(ms, new Tuple<int, dynamic>(item.Key, value));
byte[] buffer = ms.ToArray();
SerDe.Write(networkStream, buffer.Length);
SerDe.Write(networkStream, buffer);

@ -649,7 +649,7 @@ namespace Microsoft.Spark.CSharp
}
watch.Stop();

yield return new KeyValuePair<byte[], byte[]>(pairKey, pairValue);
yield return new Tuple<byte[], byte[]>(pairKey, pairValue);
break;
}
@ -573,7 +573,7 @@ namespace WorkerTest
{
WritePayloadHeaderToWorker(s);
byte[] command = SparkContext.BuildCommand(
new CSharpWorkerFunc((pid, iter) => iter.Cast<KeyValuePair<byte[], byte[]>>().Select(pair => pair.Key)),
new CSharpWorkerFunc((pid, iter) => iter.Cast<Tuple<byte[], byte[]>>().Select(pair => pair.Item1)),
SerializedMode.Pair, SerializedMode.None);

SerDe.Write(s, command.Length);

@ -713,7 +713,7 @@ namespace WorkerTest
/// <summary>
/// read accumulator
/// </summary>
private IEnumerable<KeyValuePair<int, dynamic>> ReadAccumulator(Stream s, int expectedCount = 0)
private IEnumerable<Tuple<int, dynamic>> ReadAccumulator(Stream s, int expectedCount = 0)
{
int count = 0;
var formatter = new BinaryFormatter();

@ -723,7 +723,7 @@ namespace WorkerTest
if (length > 0)
{
var ms = new MemoryStream(SerDe.ReadBytes(s, length));
yield return (KeyValuePair<int, dynamic>)formatter.Deserialize(ms);
yield return (Tuple<int, dynamic>)formatter.Deserialize(ms);

if (expectedCount > 0 && ++count >= expectedCount)
{

@ -780,8 +780,8 @@ namespace WorkerTest
int accumulatorsCount = SerDe.ReadInt(s);
Assert.IsTrue(accumulatorsCount == 1);
var accumulatorFromWorker = ReadAccumulator(s, accumulatorsCount).First();
Assert.AreEqual(accumulatorId, accumulatorFromWorker.Key);
Assert.AreEqual(expectedCount, accumulatorFromWorker.Value);
Assert.AreEqual(accumulatorId, accumulatorFromWorker.Item1);
Assert.AreEqual(expectedCount, accumulatorFromWorker.Item2);

SerDe.ReadInt(s);
}
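The worker-side hunks above imply that accumulator updates now travel as serialized `Tuple<int, dynamic>` pairs of (accumulatorId, value) rather than `KeyValuePair`. A rough sketch of what the test helper deserializes, with `stream`, `accumulatorId` and surrounding setup treated as placeholders:

```c#
// Each accumulator record on the wire is a length-prefixed, BinaryFormatter-serialized Tuple<int, dynamic>.
var formatter = new BinaryFormatter();
int length = SerDe.ReadInt(stream);                              // "stream" stands in for the worker socket stream
var ms = new MemoryStream(SerDe.ReadBytes(stream, length));
var update = (Tuple<int, dynamic>)formatter.Deserialize(ms);
Console.WriteLine("accumulator {0} = {1}", update.Item1, update.Item2);
```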
@ -32,17 +32,17 @@
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<Reference Include="CSharpWorker, Version=1.5.2.0, Culture=neutral, processorArchitecture=MSIL">
<Reference Include="CSharpWorker">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
</Reference>
<Reference Include="log4net, Version=1.2.15.0, Culture=neutral, PublicKeyToken=669e0ddf0bb1aa2a, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter, Version=1.6.1.0, Culture=neutral, processorArchitecture=MSIL">
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
</Reference>
<Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>

@ -67,7 +67,7 @@
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe.config">
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe.config">
<Link>CSharpWorker.exe.config</Link>
</None>
<None Include="..\..\App.config">

@ -4,5 +4,5 @@
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
</packages>

@ -35,17 +35,17 @@
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<Reference Include="CSharpWorker, Version=1.5.2.0, Culture=neutral, processorArchitecture=MSIL">
<Reference Include="CSharpWorker">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
</Reference>
<Reference Include="log4net, Version=1.2.15.0, Culture=neutral, PublicKeyToken=669e0ddf0bb1aa2a, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter, Version=1.6.1.0, Culture=neutral, processorArchitecture=MSIL">
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
</Reference>
<Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>

@ -66,7 +66,7 @@
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe.config">
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe.config">
<Link>CSharpWorker.exe.config</Link>
</None>
<None Include="..\..\App.config">

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="log4net" version="2.0.5" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />

@ -35,13 +35,13 @@
</PropertyGroup>
<ItemGroup>
<Reference Include="CSharpWorker">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
</Reference>
<Reference Include="log4net">
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
</Reference>
<Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>

@ -4,5 +4,5 @@
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
</packages>

@ -37,12 +37,12 @@
<SpecificVersion>False</SpecificVersion>
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
</Reference>
<Reference Include="CSharpWorker, Version=1.6.1.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
<Reference Include="CSharpWorker">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="Microsoft.Spark.CSharp.Adapter, Version=1.6.1.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<Reference Include="Microsoft.Spark.CSharp.Adapter">
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="System" />
@ -1,4 +1,4 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System;

@ -20,21 +20,40 @@ namespace Microsoft.Spark.CSharp.Examples
{
LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); //this is optional - DefaultLoggerService will be used if not set
var logger = LoggerServiceFactory.GetLogger(typeof(HiveDataFrameExample));

var sparkConf = new SparkConf();
var sparkContext = new SparkContext(sparkConf);
var hiveContext = new HiveContext(sparkContext);

// please give the path to input json file
var jsonFilePath = args[0];
var peopleDataFrame = hiveContext.Read().Json(jsonFilePath);

const string dbName = "SampleHiveDataBaseForMobius";
const string tableName = "people";

var builder = SparkSession.Builder().EnableHiveSupport();
// The following setting is required to use Spark 2.0 in Windows
// It may be provided in command line when running Mobius app
//builder = builder.Config("spark.sql.warehouse.dir", "<hdfs or local path>");
var session = builder.GetOrCreate();
var peopleDataFrame = session.Read().Json(jsonFilePath);
session.Sql(string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName)); // create database if not exists
session.Sql(string.Format("USE {0}", dbName));
//hiveContext.Sql(string.Format("DROP TABLE {0}", tableName)); // drop table if exists

peopleDataFrame.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName); // create table
var tablesDataFrame = session.Table(tableName); // get all tables in database
logger.LogInfo(string.Format("table count in database {0}: {1}", dbName, tablesDataFrame.Count()));
tablesDataFrame.Show();

session.Sql(string.Format("SELECT * FROM {0}", tableName)).Show(); // select from table

// Following example is for the deprecated API
/*
var sparkConf = new SparkConf();
// The following setting is required to use Spark 2.0 in Windows
// It may be provided in command line when running Mobius app
//sparkConf.Set("spark.sql.warehouse.dir", @"<hdfs or local path>");
var sparkContext = new SparkContext(sparkConf);
var hiveContext = new HiveContext(sparkContext);
var peopleDataFrame = hiveContext.Read().Json(jsonFilePath);

hiveContext.Sql(string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName)); // create database if not exists
hiveContext.Sql(string.Format("USE {0}", dbName));
hiveContext.Sql(string.Format("DROP TABLE {0}", tableName)); // drop table if exists
//hiveContext.Sql(string.Format("DROP TABLE {0}", tableName)); // drop table if exists

peopleDataFrame.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName); // create table
var tablesDataFrame = hiveContext.Tables(dbName); // get all tables in database

@ -42,6 +61,7 @@ namespace Microsoft.Spark.CSharp.Examples
tablesDataFrame.Show();

hiveContext.Sql(string.Format("SELECT * FROM {0}", tableName)).Show(); // select from table
*/
}
}
}
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<packages>
|
||||
<package id="log4net" version="2.0.5" targetFramework="net45" />
|
||||
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net452" />
|
||||
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net452" />
|
||||
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
|
||||
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
|
||||
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
|
||||
|
|
|
@ -34,17 +34,17 @@
|
|||
<WarningLevel>4</WarningLevel>
|
||||
</PropertyGroup>
|
||||
<ItemGroup>
|
||||
<Reference Include="CSharpWorker, Version=1.5.2.0, Culture=neutral, processorArchitecture=MSIL">
|
||||
<Reference Include="CSharpWorker">
|
||||
<SpecificVersion>False</SpecificVersion>
|
||||
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
|
||||
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="log4net, Version=1.2.15.0, Culture=neutral, PublicKeyToken=669e0ddf0bb1aa2a, processorArchitecture=MSIL">
|
||||
<SpecificVersion>False</SpecificVersion>
|
||||
<HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="Microsoft.Spark.CSharp.Adapter, Version=1.5.2.0, Culture=neutral, processorArchitecture=MSIL">
|
||||
<Reference Include="Microsoft.Spark.CSharp.Adapter">
|
||||
<SpecificVersion>False</SpecificVersion>
|
||||
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
|
||||
<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
|
||||
<SpecificVersion>False</SpecificVersion>
|
||||
|
@ -65,7 +65,7 @@
|
|||
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe.config">
|
||||
<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe.config">
|
||||
<Link>CSharpWorker.exe.config</Link>
|
||||
</None>
|
||||
<None Include="..\..\App.config">
|
||||
|
|
|
@ -4,5 +4,5 @@
|
|||
<package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
|
||||
<package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
|
||||
<package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
|
||||
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
|
||||
<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
|
||||
</packages>
|
||||
|
|
|
@@ -34,17 +34,17 @@
 <WarningLevel>4</WarningLevel>
 </PropertyGroup>
 <ItemGroup>
-<Reference Include="CSharpWorker, Version=1.5.2.0, Culture=neutral, processorArchitecture=MSIL">
+<Reference Include="CSharpWorker">
 <SpecificVersion>False</SpecificVersion>
-<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
+<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
 </Reference>
 <Reference Include="log4net, Version=1.2.15.0, Culture=neutral, PublicKeyToken=669e0ddf0bb1aa2a, processorArchitecture=MSIL">
 <SpecificVersion>False</SpecificVersion>
 <HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
 </Reference>
-<Reference Include="Microsoft.Spark.CSharp.Adapter, Version=1.5.2.0, Culture=neutral, processorArchitecture=MSIL">
+<Reference Include="Microsoft.Spark.CSharp.Adapter">
 <SpecificVersion>False</SpecificVersion>
-<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
+<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
 </Reference>
 <Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
 <SpecificVersion>False</SpecificVersion>

@@ -65,7 +65,7 @@
 <Compile Include="Properties\AssemblyInfo.cs" />
 </ItemGroup>
 <ItemGroup>
-<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe.config">
+<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe.config">
 <Link>CSharpWorker.exe.config</Link>
 </None>
 <None Include="..\..\App.config">

@@ -4,5 +4,5 @@
 <package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
 <package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
 <package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
-<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
+<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
 </packages>
@@ -35,7 +35,7 @@
 </PropertyGroup>
 <ItemGroup>
 <Reference Include="CSharpWorker">
-<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
+<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
 </Reference>
 <Reference Include="log4net, Version=1.2.15.0, Culture=neutral, PublicKeyToken=669e0ddf0bb1aa2a, processorArchitecture=MSIL">
 <SpecificVersion>False</SpecificVersion>

@@ -43,7 +43,7 @@
 </Reference>
 <Reference Include="Microsoft.CSharp" />
 <Reference Include="Microsoft.Spark.CSharp.Adapter">
-<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
+<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
 </Reference>
 <Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
 <SpecificVersion>False</SpecificVersion>

@@ -68,7 +68,7 @@
 <Compile Include="EventPublisher.cs" />
 </ItemGroup>
 <ItemGroup>
-<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe.config">
+<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe.config">
 <Link>CSharpWorker.exe.config</Link>
 </None>
 <None Include="..\..\App.config">

@@ -4,7 +4,7 @@
 <package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
 <package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
 <package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
-<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
+<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />

 <!-- *** ****************************************************************** *** -->
 <!-- *** Following references are needed for publishing events to EventHubs *** -->
@@ -38,7 +38,7 @@
 </Reference>
 <Reference Include="CSharpWorker">
 <SpecificVersion>False</SpecificVersion>
-<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
+<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
 </Reference>
 <Reference Include="log4net">
 <SpecificVersion>False</SpecificVersion>

@@ -46,7 +46,7 @@
 </Reference>
 <Reference Include="Microsoft.Spark.CSharp.Adapter">
 <SpecificVersion>False</SpecificVersion>
-<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
+<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
 </Reference>
 <Reference Include="Razorvine.Pyrolite">
 <SpecificVersion>False</SpecificVersion>

@@ -64,7 +64,7 @@
 <Compile Include="Properties\AssemblyInfo.cs" />
 </ItemGroup>
 <ItemGroup>
-<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe.config">
+<None Include="..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe.config">
 <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
 </None>
 <None Include="..\..\App.config">

@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="utf-8"?>
 <packages>
 <package id="log4net" version="2.0.5" targetFramework="net45" />
-<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
+<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
 <package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
 <package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
 <package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
@@ -33,14 +33,14 @@
 </PropertyGroup>
 <ItemGroup>
 <Reference Include="CSharpWorker">
-<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
+<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
 </Reference>
 <Reference Include="log4net, Version=1.2.10.0, Culture=neutral, PublicKeyToken=1b44e1d426115821, processorArchitecture=MSIL">
 <SpecificVersion>False</SpecificVersion>
 <HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
 </Reference>
 <Reference Include="Microsoft.Spark.CSharp.Adapter">
-<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
+<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
 </Reference>
 <Reference Include="Newtonsoft.Json, Version=4.5.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
 <SpecificVersion>False</SpecificVersion>

@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="utf-8"?>
 <packages>
 <package id="log4net" version="2.0.5" targetFramework="net45" />
-<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
+<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
 <package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
 <package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
 <package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
@@ -66,13 +66,13 @@
 </ItemGroup>
 <ItemGroup>
 <Reference Include="CSharpWorker">
-<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
+<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
 </Reference>
 <Reference Include="log4net">
 <HintPath>..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll</HintPath>
 </Reference>
 <Reference Include="Microsoft.Spark.CSharp.Adapter">
-<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
+<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
 </Reference>
 <Reference Include="mscorlib" />
 <Reference Include="FSharp.Core, Version=$(TargetFSharpCoreVersion), Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a">

@@ -2,7 +2,7 @@
 <packages>
 <package id="FSharp.Core" version="4.0.0.1" targetFramework="net45" />
 <package id="log4net" version="2.0.5" targetFramework="net45" />
-<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
+<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
 <package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
 <package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
 <package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />

@@ -71,7 +71,7 @@
 </ItemGroup>
 <ItemGroup>
 <Reference Include="CSharpWorker">
-<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\CSharpWorker.exe</HintPath>
+<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\CSharpWorker.exe</HintPath>
 <Private>True</Private>
 </Reference>
 <Reference Include="FSharp.Core">

@@ -83,7 +83,7 @@
 <Private>True</Private>
 </Reference>
 <Reference Include="Microsoft.Spark.CSharp.Adapter">
-<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-1\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
+<HintPath>..\..\packages\Microsoft.SparkCLR.2.0.0-PREVIEW-2\lib\net45\Microsoft.Spark.CSharp.Adapter.dll</HintPath>
 <Private>True</Private>
 </Reference>
 <Reference Include="mscorlib" />

@@ -2,7 +2,7 @@
 <packages>
 <package id="FSharp.Core" version="4.0.0.1" targetFramework="net45" />
 <package id="log4net" version="2.0.5" targetFramework="net45" />
-<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-1" targetFramework="net45" />
+<package id="Microsoft.SparkCLR" version="2.0.0-PREVIEW-2" targetFramework="net45" />
 <package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
 <package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
 <package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />
@@ -6,7 +6,8 @@
 * Maven 3.0.5 or above.
 * Mono 4.2 stable or above. The download and installation instructions for Mono are available in [http://www.mono-project.com/download/#download-lin](http://www.mono-project.com/download/#download-lin) (see [Debian, Ubuntu and derivatives](http://www.mono-project.com/docs/getting-started/install/linux/#debian-ubuntu-and-derivatives) or [CentOS, Fedora, similar Linux distributions or OS X](http://www.mono-project.com/docs/getting-started/install/linux/#centos-7-fedora-19-and-later-and-derivatives))
 * F# for Mono. The download and installation instructions for the F# Mono extension are available in [http://fsharp.org/use/linux/](http://fsharp.org/use/linux/)
-* NuGet.
+* NuGet
+* wget
 * XSLTPROC

 The following environment variables should be set properly:

@@ -15,9 +16,10 @@ The following environment variables should be set properly:

 ## Instructions

-Instructions to build Mobius in Linux are same as [instructions for Windows](./windows-instructions.md#instructions). The only change required is to use the following script files instead of .cmd files:
-* build.sh
-* clean.sh
+Instructions to build Mobius on Linux are the same as the [instructions for Windows](./windows-instructions.md#instructions). The only change required is to use the following script files instead of `.cmd` files:
+
+* `build.sh`
+* `clean.sh`

 # Running Unit Tests in Linux

New file: mobius-init.fsx

@@ -0,0 +1,31 @@
+// *** Replace the paths below to point to correct location of Mobius binaries ***
+#r @"C:\spark-clr_2.11-2.0.000\runtime\bin\Microsoft.Spark.CSharp.Adapter.dll"
+#r @"C:\spark-clr_2.11-2.0.000\runtime\bin\log4net.dll"
+#r @"C:\spark-clr_2.11-2.0.000\runtime\bin\Newtonsoft.Json.dll"
+#r @"C:\spark-clr_2.11-2.0.000\runtime\bin\Razorvine.Pyrolite.dll"
+#r @"C:\spark-clr_2.11-2.0.000\runtime\bin\Razorvine.Serpent.dll"
+#r @"C:\spark-clr_2.11-2.0.000\runtime\bin\CSharpWorker.exe"
+open Microsoft.Spark.CSharp.Core
+open Microsoft.Spark.CSharp.Services
+open Microsoft.Spark.CSharp.Sql
+open System.Reflection
+open System.Collections.Generic
+LoggerServiceFactory.SetLoggerService Log4NetLoggerService.Instance
+
+// *** Uncomment & use the following code block to use SqlContext API ***
+//let conf = SparkConf().SetAppName "FSharpInteractiveShell"
+// *** uncomment & update master URL if running in non-local mode ***
+//conf.Master "spark://sparkmaster:7077"
+// *** Spark 2.0 in Windows requires the following config ***
+//conf.Set("spark.sql.warehouse.dir", @"file:///C:/sparktemp")
+//let sc = SparkContext conf
+//let sqlContext = SqlContext sc
+
+// *** Uncomment & use the following code block to use SparkSession API ***
+let builder = SparkSession.Builder()
+builder = builder.AppName "FSharpInteractiveShell"
+// *** uncomment & update master URL if running in non-local mode ***
+//builder = builder.Master "spark://sparkmaster:7077"
+// *** Spark 2.0 in Windows requires the following config ***
+builder = builder.Config("spark.sql.warehouse.dir", "file:///C:/sparktemp")
+let session = builder.GetOrCreate()
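A note on the `builder = builder.AppName ...` lines in the script above: in F#, `=` outside a `let` binding is the equality operator, so each of these lines compares `builder` with the value returned by `AppName`/`Config` and discards the boolean result. The settings still take effect only if, as the fluent API usage in the script suggests, the Mobius `Builder` records each option and returns itself; that behavior is an assumption here, not something the diff states. A more idiomatic sketch of the same initialization, under that assumption, chains the builder calls instead:

```fsharp
// Minimal sketch (same assumptions and paths as mobius-init.fsx):
// chain the fluent Builder calls rather than using '=' as a pseudo-assignment.
let session =
    SparkSession.Builder()
        .AppName("FSharpInteractiveShell")
        .Config("spark.sql.warehouse.dir", "file:///C:/sparktemp") // Spark 2.0 on Windows
        .GetOrCreate()
```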
New file: spark-fsharp-mobius.md

@@ -0,0 +1,17 @@
+# Implementing Spark Apps in F# using Mobius
+
+## Non-Interactive Apps
+1. Develop your application in an F# IDE using the Mobius API. Refer to [F# examples](../examples/fsharp) for sample code
+2. Use [`sparkclr-submit.cmd`](running-mobius-app.md) to run your Mobius-based Spark application implemented in F#
+
+## Interactive Apps
+### Using F# Interactive (fsi.exe)
+1. Run `sparkclr-submit.cmd debug` in a command prompt after setting the necessary [environment variables](running-mobius-app.md#pre-requisites). Note that the `debug` parameter is a misnomer in this context; the command initializes the .NET-JVM bridge similar to [running Mobius apps in debug mode](./running-mobius-app.md#debug-mode).
+2. In Developer Command Prompt for VS, run `fsi.exe --use:c:\temp\mobius-init.fsx`. [mobius-init.fsx](mobius-init.fsx) has the initialization code that can be used to create `SparkContext`, `SqlContext` or `SparkSession`. You need to update the location of the Mobius binaries referenced at the beginning of the script file. You may also need to update other configuration settings in the script.
+3. When the F# command prompt is available, Spark functionality can be invoked using the Mobius API. For example, the following code can be used to process a JSON file.
+```
+let dataframe = session.Read().Json @"C:\temp\data.json";;
+dataframe.Show();;
+dataframe.ShowSchema();;
+dataframe.Count();;
+```
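To complement the Non-Interactive Apps steps above, here is a minimal sketch of a standalone F# program built on the Mobius API, of the kind that would be submitted with `sparkclr-submit.cmd`. It reuses only calls that appear elsewhere in this change (`SparkSession.Builder()`, `AppName`, `GetOrCreate`, `Read().Json`, `Show`, `ShowSchema`, `Count`, and the log4net logger setup from mobius-init.fsx); the module name and input path are placeholders, and the overall structure is an assumption rather than a reproduction of the repository's F# examples.

```fsharp
// A minimal, hypothetical Mobius F# app sketch; names and paths are placeholders.
module MobiusFSharpExample

open Microsoft.Spark.CSharp.Core
open Microsoft.Spark.CSharp.Services
open Microsoft.Spark.CSharp.Sql

[<EntryPoint>]
let main argv =
    // Route Mobius logging through log4net, as in mobius-init.fsx
    LoggerServiceFactory.SetLoggerService Log4NetLoggerService.Instance

    // Build a SparkSession; master URL and other settings come from sparkclr-submit.cmd
    let session =
        SparkSession.Builder()
            .AppName("MobiusFSharpExample")
            .GetOrCreate()

    // Read a JSON file into a DataFrame and inspect it
    let dataframe = session.Read().Json @"hdfs://path/to/data.json"
    dataframe.ShowSchema()
    dataframe.Show()
    printfn "Row count: %d" (dataframe.Count())
    0
```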