зеркало из https://github.com/dotnet/spark.git
feat: add base classes for ML and refine code base (#1031)
This commit is contained in:
Родитель
c89bd283f5
Коммит
b2fa3508d7
|
@ -25,7 +25,7 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
|
|||
/// <param name="paramName">The name of a parameter that can be set on this object</param>
|
||||
/// <param name="paramValue">A parameter value that can be set on this object</param>
|
||||
public void TestFeatureBase(
|
||||
FeatureBase<T> testObject,
|
||||
Params testObject,
|
||||
string paramName,
|
||||
object paramValue)
|
||||
{
|
||||
|
@ -37,8 +37,8 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
|
|||
Assert.Equal(param.Parent, testObject.Uid());
|
||||
|
||||
Assert.NotEmpty(testObject.ExplainParam(param));
|
||||
testObject.Set(param, paramValue);
|
||||
Assert.IsAssignableFrom<Identifiable>(testObject.Clear(param));
|
||||
testObject.Set<T>(param, paramValue);
|
||||
Assert.IsAssignableFrom<Identifiable>(testObject.Clear<T>(param));
|
||||
|
||||
Assert.IsType<string>(testObject.Uid());
|
||||
}
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.IO;
|
||||
using Microsoft.Spark.ML.Feature;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.UnitTest.TestUtils;
|
||||
using Microsoft.Spark.Sql.Types;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
|
||||
{
|
||||
[Collection("Spark E2E Tests")]
public class PipelineModelTests : FeatureBaseTests<PipelineModel>
{
    private readonly SparkSession _spark;

    public PipelineModelTests(SparkFixture fixture) : base(fixture)
    {
        _spark = fixture.Spark;
    }

    /// <summary>
    /// Builds a <see cref="PipelineModel"/> from a single configured
    /// <see cref="Bucketizer"/> stage and exercises Transform, TransformSchema,
    /// Save/Load and Write/Read on it.
    /// </summary>
    [Fact]
    public void TestPipelineModelTransform()
    {
        const string handleInvalid = "skip";
        const string bucketizerUid = "uid";
        const string inputColName = "input_col";
        const string outputColName = "output_col";
        double[] splits = { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };

        // The setter chain's return value is intentionally discarded; the test keeps
        // using the original bucketizer instance afterwards.
        var bucketizer = new Bucketizer(bucketizerUid);
        bucketizer
            .SetInputCol(inputColName)
            .SetOutputCol(outputColName)
            .SetHandleInvalid(handleInvalid)
            .SetSplits(splits);

        JavaTransformer[] stages = { bucketizer };
        var model = new PipelineModel("randomUID", stages);

        DataFrame source = _spark.Sql("SELECT ID as input_col from range(100)");
        DataFrame transformed = model.Transform(source);

        // The transformed frame must expose the bucketizer's output column.
        Assert.Contains(transformed.Schema().Fields, field => field.Name == outputColName);

        // The stage itself must still report the configuration it was given.
        Assert.Equal(inputColName, bucketizer.GetInputCol());
        Assert.Equal(outputColName, bucketizer.GetOutputCol());
        Assert.Equal(splits, bucketizer.GetSplits());

        Assert.IsType<StructType>(model.TransformSchema(source.Schema()));
        Assert.IsType<DataFrame>(transformed);

        using (var tempDirectory = new TemporaryDirectory())
        {
            // Round trip through Save / static Load.
            string saveTarget = Path.Join(tempDirectory.Path, "pipelineModel");
            model.Save(saveTarget);
            PipelineModel reloaded = PipelineModel.Load(saveTarget);
            Assert.Equal(model.Uid(), reloaded.Uid());

            // Round trip through the writer / reader objects.
            string writeTarget = Path.Join(tempDirectory.Path, "pipelineModelWithWrite");
            model.Write().Save(writeTarget);
            PipelineModel reloadedViaReader = model.Read().Load(writeTarget);
            Assert.Equal(model.Uid(), reloadedViaReader.Uid());
        }
    }
}
|
||||
}
|
|
@ -0,0 +1,111 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.IO;
|
||||
using Microsoft.Spark.ML.Feature;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.UnitTest.TestUtils;
|
||||
using Microsoft.Spark.Sql.Types;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
|
||||
{
|
||||
[Collection("Spark E2E Tests")]
public class PipelineTests : FeatureBaseTests<Pipeline>
{
    private readonly SparkSession _spark;

    public PipelineTests(SparkFixture fixture) : base(fixture)
    {
        _spark = fixture.Spark;
    }

    /// <summary>
    /// Builds a <see cref="Pipeline"/> from two unconfigured stages, checks that
    /// GetStages returns the stages that were set, round-trips the pipeline through
    /// Save/Load, and finally runs the shared checks in
    /// <see cref="TestFeatureBase"/>.
    /// </summary>
    [Fact]
    public void TestPipeline()
    {
        JavaPipelineStage[] stages = { new Bucketizer(), new CountVectorizer() };

        Pipeline pipeline = new Pipeline().SetStages(stages);
        JavaPipelineStage[] returnedStages = pipeline.GetStages();

        // Each returned stage must match the stage supplied at the same position.
        for (int i = 0; i < stages.Length; i++)
        {
            Assert.Equal(stages[i].Uid(), returnedStages[i].Uid());
            Assert.Equal(stages[i].ToString(), returnedStages[i].ToString());
        }

        using (var tempDirectory = new TemporaryDirectory())
        {
            string saveTarget = Path.Join(tempDirectory.Path, "pipeline");
            pipeline.Save(saveTarget);

            Pipeline reloaded = Pipeline.Load(saveTarget);
            Assert.Equal(pipeline.Uid(), reloaded.Uid());
        }

        TestFeatureBase(pipeline, "stages", stages);
    }

    /// <summary>
    /// Fits a <see cref="Pipeline"/> holding one <see cref="CountVectorizer"/> stage,
    /// transforms the input with the resulting <see cref="PipelineModel"/>, and checks
    /// the Save/Load and Write/Read round trips of the pipeline.
    /// </summary>
    [Fact]
    public void TestPipelineFit()
    {
        DataFrame source = _spark.Sql("SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " +
            "'TOKENIZE') as input from range(100)");

        const string inputColName = "input";
        const string outputColName = "output";
        const double minDocFrequency = 1;
        const double minTermFrequency = 10;
        const int vocabularySize = 10000;

        CountVectorizer vectorizer = new CountVectorizer()
            .SetInputCol(inputColName)
            .SetOutputCol(outputColName)
            .SetMinDF(minDocFrequency)
            .SetMinTF(minTermFrequency)
            .SetVocabSize(vocabularySize);

        JavaPipelineStage[] stages = { vectorizer };

        Pipeline pipeline = new Pipeline().SetStages(stages);
        PipelineModel fittedModel = pipeline.Fit(source);

        DataFrame transformed = fittedModel.Transform(source);

        Assert.IsType<StructType>(fittedModel.TransformSchema(source.Schema()));
        Assert.IsType<DataFrame>(transformed);

        using (var tempDirectory = new TemporaryDirectory())
        {
            // Round trip through Save / static Load.
            string saveTarget = Path.Join(tempDirectory.Path, "pipeline");
            pipeline.Save(saveTarget);
            Pipeline reloaded = Pipeline.Load(saveTarget);
            Assert.Equal(pipeline.Uid(), reloaded.Uid());

            // Round trip through the writer / reader objects.
            string writeTarget = Path.Join(tempDirectory.Path, "pipelineWithWrite");
            pipeline.Write().Save(writeTarget);
            Pipeline reloadedViaReader = pipeline.Read().Load(writeTarget);
            Assert.Equal(pipeline.Uid(), reloadedViaReader.Uid());
        }

        TestFeatureBase(pipeline, "stages", stages);
    }
}
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Interop;
|
||||
using Microsoft.Spark.Interop.Internal.Java.Util;
|
||||
|
||||
namespace System
|
||||
{
|
||||
/// <summary>
|
||||
/// ArrayExtensions host custom extension methods for the
|
||||
/// dotnet base class array T[].
|
||||
/// </summary>
|
||||
public static class ArrayExtensions
{
    /// <summary>
    /// Adds every element of a .NET array, in index order, to a new
    /// <see cref="ArrayList"/> created with <see cref="SparkEnvironment.JvmBridge"/>.
    /// </summary>
    /// <typeparam name="T">Element type of <paramref name="array"/>.</typeparam>
    /// <param name="array">The array whose elements are added to the list.</param>
    /// <returns>The populated <see cref="ArrayList"/>.</returns>
    internal static ArrayList ToJavaArrayList<T>(this T[] array)
    {
        var javaList = new ArrayList(SparkEnvironment.JvmBridge);
        for (int index = 0; index < array.Length; index++)
        {
            javaList.Add(array[index]);
        }

        return javaList;
    }
}
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Interop;
|
||||
using Microsoft.Spark.Interop.Internal.Java.Util;
|
||||
|
||||
namespace System.Collections.Generic
|
||||
{
|
||||
public static class Dictionary
{
    /// <summary>
    /// Copies a .NET dictionary with string keys and string values into a new
    /// <see cref="HashMap"/> (the wrapper for <c>java.util.HashMap</c>).
    /// </summary>
    /// <param name="dictionary">The dictionary whose entries are copied.</param>
    /// <returns>A <see cref="HashMap"/> holding the same key/value pairs.</returns>
    internal static HashMap ToJavaHashMap(this Dictionary<string, string> dictionary) =>
        CopyToJavaHashMap(dictionary);

    /// <summary>
    /// Copies a .NET dictionary with string keys and object values into a new
    /// <see cref="HashMap"/> (the wrapper for <c>java.util.HashMap</c>).
    /// </summary>
    /// <param name="dictionary">The dictionary whose entries are copied.</param>
    /// <returns>A <see cref="HashMap"/> holding the same key/value pairs.</returns>
    internal static HashMap ToJavaHashMap(this Dictionary<string, object> dictionary) =>
        CopyToJavaHashMap(dictionary);

    /// <summary>
    /// Shared implementation of the public overloads: creates the map with
    /// <see cref="SparkEnvironment.JvmBridge"/> and puts each entry into it.
    /// </summary>
    /// <typeparam name="TValue">Value type of the source dictionary.</typeparam>
    /// <param name="source">The dictionary whose entries are copied.</param>
    /// <returns>The populated <see cref="HashMap"/>.</returns>
    private static HashMap CopyToJavaHashMap<TValue>(Dictionary<string, TValue> source)
    {
        var javaMap = new HashMap(SparkEnvironment.JvmBridge);
        foreach (KeyValuePair<string, TValue> entry in source)
        {
            javaMap.Put(entry.Key, entry.Value);
        }

        return javaMap;
    }
}
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
|
||||
namespace Microsoft.Spark.Interop.Internal.Java.Util
|
||||
{
|
||||
/// <summary>
|
||||
/// HashMap class represents a <c>java.util.HashMap</c> object.
|
||||
/// </summary>
|
||||
internal sealed class HashMap : IJvmObjectReferenceProvider
{
    /// <summary>
    /// Create a <c>java.util.HashMap</c> JVM object.
    /// </summary>
    /// <param name="jvm">JVM bridge to use</param>
    internal HashMap(IJvmBridge jvm) =>
        Reference = jvm.CallConstructor("java.util.HashMap");

    /// <summary>
    /// Reference to the underlying <c>java.util.HashMap</c> JVM object. It is assigned
    /// once in the constructor and never replaced.
    /// </summary>
    public JvmObjectReference Reference { get; }

    /// <summary>
    /// Associates the specified value with the specified key in this map.
    /// If the map previously contained a mapping for the key, the old value is replaced.
    /// </summary>
    /// <param name="key">key with which the specified value is to be associated</param>
    /// <param name="value">value to be associated with the specified key</param>
    internal void Put(object key, object value) =>
        Reference.Invoke("put", key, value);

    /// <summary>
    /// Returns the value to which the specified key is mapped,
    /// or null if this map contains no mapping for the key.
    /// </summary>
    /// <param name="key">key whose associated value is to be returned</param>
    /// <returns>value associated with the specified key</returns>
    internal object Get(object key) =>
        Reference.Invoke("get", key);

    /// <summary>
    /// Returns true if this map maps one or more keys to the specified value.
    /// </summary>
    /// <param name="value">value whose presence in this map is to be tested</param>
    /// <returns>true if this map maps one or more keys to the specified value</returns>
    internal bool ContainsValue(object value) =>
        (bool)Reference.Invoke("containsValue", value);

    /// <summary>
    /// Returns an array of the keys contained in this map.
    /// </summary>
    /// <returns>An array of object hosting the keys contained in the map</returns>
    internal object[] Keys()
    {
        // keySet() yields a JVM-side set; toArray() on that set is what gets
        // returned to .NET as an object array.
        var keySet = (JvmObjectReference)Reference.Invoke("keySet");
        return (object[])keySet.Invoke("toArray");
    }
}
|
||||
}
|
|
@ -1,4 +1,7 @@
|
|||
using System;
|
||||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.Linq;
|
||||
using System.Reflection;
|
||||
using Microsoft.Spark.Interop;
|
||||
|
@ -7,27 +10,23 @@ using Microsoft.Spark.Interop.Ipc;
|
|||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
/// <summary>
|
||||
/// FeatureBase is to share code amongst all of the ML.Feature objects, there are a few
|
||||
/// interfaces that the Scala code implements across all of the objects. This should help to
|
||||
/// write the extra objects faster.
|
||||
/// Params is used for components that take parameters. This also provides
|
||||
/// an internal param map to store parameter values attached to the instance.
|
||||
/// This abstract class corresponds to Scala's Params trait.
|
||||
/// </summary>
|
||||
/// <typeparam name="T">
|
||||
/// The class that implements FeatureBase, this is needed so we can create new objects where
|
||||
/// spark returns new objects rather than update existing objects.
|
||||
/// </typeparam>
|
||||
public class FeatureBase<T> : Identifiable, IJvmObjectReferenceProvider
|
||||
{
|
||||
internal FeatureBase(string className)
|
||||
public abstract class Params : Identifiable, IJvmObjectReferenceProvider
|
||||
{
|
||||
internal Params(string className)
|
||||
: this(SparkEnvironment.JvmBridge.CallConstructor(className))
|
||||
{
|
||||
}
|
||||
|
||||
internal FeatureBase(string className, string uid)
|
||||
|
||||
internal Params(string className, string uid)
|
||||
: this(SparkEnvironment.JvmBridge.CallConstructor(className, uid))
|
||||
{
|
||||
}
|
||||
|
||||
internal FeatureBase(JvmObjectReference jvmObject)
|
||||
|
||||
internal Params(JvmObjectReference jvmObject)
|
||||
{
|
||||
Reference = jvmObject;
|
||||
}
|
||||
|
@ -39,7 +38,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <returns>JVM toString() value</returns>
|
||||
public override string ToString() => (string)Reference.Invoke("toString");
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// The UID that was used to create the object. If no UID is passed in when creating the
|
||||
/// object then a random UID is created when the object is created.
|
||||
|
@ -47,30 +46,12 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>string UID identifying the object</returns>
|
||||
public string Uid() => (string)Reference.Invoke("uid");
|
||||
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
/// <returns>New object</returns>
|
||||
public T Save(string path) =>
|
||||
WrapAsType((JvmObjectReference)Reference.Invoke("save", path));
|
||||
|
||||
/// <summary>
|
||||
/// Clears any value that was previously set for this <see cref="Param"/>. The value is
|
||||
/// reset to the default value.
|
||||
/// </summary>
|
||||
/// <param name="param">The <see cref="Param"/> to set back to its original value</param>
|
||||
/// <returns>Object reference that was used to clear the <see cref="Param"/></returns>
|
||||
public T Clear(Param.Param param) =>
|
||||
WrapAsType((JvmObjectReference)Reference.Invoke("clear", param));
|
||||
|
||||
/// <summary>
|
||||
/// Returns a description of how a specific <see cref="Param"/> works and is currently set.
|
||||
/// </summary>
|
||||
/// <param name="param">The <see cref="Param"/> to explain</param>
|
||||
/// <returns>Description of the <see cref="Param"/></returns>
|
||||
public string ExplainParam(Param.Param param) =>
|
||||
public string ExplainParam(Param.Param param) =>
|
||||
(string)Reference.Invoke("explainParam", param);
|
||||
|
||||
/// <summary>
|
||||
|
@ -80,13 +61,30 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>Description of all the applicable <see cref="Param"/>'s</returns>
|
||||
public string ExplainParams() => (string)Reference.Invoke("explainParams");
|
||||
|
||||
/// <summary>Checks whether a param is explicitly set.</summary>
|
||||
/// <param name="param">The <see cref="Param"/> to be checked.</param>
|
||||
/// <returns>bool</returns>
|
||||
public bool IsSet(Param.Param param) => (bool)Reference.Invoke("isSet", param);
|
||||
|
||||
/// <summary>Checks whether a param is explicitly set or has a default value.</summary>
|
||||
/// <param name="param">The <see cref="Param"/> to be checked.</param>
|
||||
/// <returns>bool</returns>
|
||||
public bool IsDefined(Param.Param param) => (bool)Reference.Invoke("isDefined", param);
|
||||
|
||||
/// <summary>
|
||||
/// Tests whether this instance contains a param with a given name.
|
||||
/// </summary>
|
||||
/// <param name="paramName">The name of the <see cref="Param"/> to test for.</param>
|
||||
/// <returns>bool</returns>
|
||||
public bool HasParam(string paramName) => (bool)Reference.Invoke("hasParam", paramName);
|
||||
|
||||
/// <summary>
|
||||
/// Retrieves a <see cref="Param"/> so that it can be used to set the value of the
|
||||
/// <see cref="Param"/> on the object.
|
||||
/// </summary>
|
||||
/// <param name="paramName">The name of the <see cref="Param"/> to get.</param>
|
||||
/// <returns><see cref="Param"/> that can be used to set the actual value</returns>
|
||||
public Param.Param GetParam(string paramName) =>
|
||||
public Param.Param GetParam(string paramName) =>
|
||||
new Param.Param((JvmObjectReference)Reference.Invoke("getParam", paramName));
|
||||
|
||||
/// <summary>
|
||||
|
@ -95,10 +93,19 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <param name="param"><see cref="Param"/> to set the value of</param>
|
||||
/// <param name="value">The value to use</param>
|
||||
/// <returns>The object that contains the newly set <see cref="Param"/></returns>
|
||||
public T Set(Param.Param param, object value) =>
|
||||
WrapAsType((JvmObjectReference)Reference.Invoke("set", param, value));
|
||||
public T Set<T>(Param.Param param, object value) =>
|
||||
WrapAsType<T>((JvmObjectReference)Reference.Invoke("set", param, value));
|
||||
|
||||
private static T WrapAsType(JvmObjectReference reference)
|
||||
/// <summary>
|
||||
/// Clears any value that was previously set for this <see cref="Param"/>. The value is
|
||||
/// reset to the default value.
|
||||
/// </summary>
|
||||
/// <param name="param">The <see cref="Param"/> to set back to its original value</param>
|
||||
/// <returns>Object reference that was used to clear the <see cref="Param"/></returns>
|
||||
public T Clear<T>(Param.Param param) =>
|
||||
WrapAsType<T>((JvmObjectReference)Reference.Invoke("clear", param));
|
||||
|
||||
protected static T WrapAsType<T>(JvmObjectReference reference)
|
||||
{
|
||||
ConstructorInfo constructor = typeof(T)
|
||||
.GetConstructors(BindingFlags.NonPublic | BindingFlags.Instance)
|
||||
|
@ -109,7 +116,32 @@ namespace Microsoft.Spark.ML.Feature
|
|||
(parameters[0].ParameterType == typeof(JvmObjectReference));
|
||||
});
|
||||
|
||||
return (T)constructor.Invoke(new object[] {reference});
|
||||
return (T)constructor.Invoke(new object[] { reference });
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// DotnetUtils is used to hold basic general helper functions that
|
||||
/// are used within ML scope.
|
||||
/// </summary>
|
||||
internal class DotnetUtils
{
    /// <summary>
    /// Derives the .NET wrapper class name and the wrap-method name for a JVM object
    /// from the JVM object's runtime type name.
    /// </summary>
    /// <remarks>
    /// The JVM package prefixes <c>com.microsoft.azure.synapse.ml</c> and
    /// <c>org.apache.spark.ml</c> are replaced by <c>Synapse.ML</c> and
    /// <c>Microsoft.Spark.ML</c>, after which the first character of every
    /// dot-separated segment is upper-cased. The method name is <c>WrapAs</c> followed
    /// by the last segment exactly as it appears in the JVM type name.
    /// </remarks>
    /// <param name="jvmObject">The reference to object created in JVM.</param>
    /// <returns>A string tuple of constructor class name and method name</returns>
    internal static (string, string) GetUnderlyingType(JvmObjectReference jvmObject)
    {
        var jvmClass = (JvmObjectReference)jvmObject.Invoke("getClass");
        var jvmTypeName = (string)jvmClass.Invoke("getTypeName");

        string[] segments = jvmTypeName
            .Replace("com.microsoft.azure.synapse.ml", "Synapse.ML")
            .Replace("org.apache.spark.ml", "Microsoft.Spark.ML")
            .Split('.');

        // Use invariant casing: the culture-sensitive char.ToUpper turns 'i' into 'İ'
        // under Turkish cultures, which would corrupt the derived type name for any
        // segment starting with 'i'.
        string constructorClass = string.Join(
            ".",
            segments.Select(segment => char.ToUpperInvariant(segment[0]) + segment.Substring(1)));

        string methodName = "WrapAs" + segments[segments.Length - 1];

        return (constructorClass, methodName);
    }
}
|
||||
}
|
|
@ -2,7 +2,6 @@
|
|||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Microsoft.Spark.Interop;
|
||||
|
@ -20,11 +19,14 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// will be thrown. The splits parameter is only used for single column usage, and splitsArray
|
||||
/// is for multiple columns.
|
||||
/// </summary>
|
||||
public class Bucketizer : FeatureBase<Bucketizer>
|
||||
public class Bucketizer :
|
||||
JavaModel<Bucketizer>,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<Bucketizer>
|
||||
{
|
||||
private static readonly string s_bucketizerClassName =
|
||||
private static readonly string s_bucketizerClassName =
|
||||
"org.apache.spark.ml.feature.Bucketizer";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Create a <see cref="Bucketizer"/> without any parameters
|
||||
/// </summary>
|
||||
|
@ -40,11 +42,11 @@ namespace Microsoft.Spark.ML.Feature
|
|||
public Bucketizer(string uid) : base(s_bucketizerClassName, uid)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
internal Bucketizer(JvmObjectReference jvmObject) : base(jvmObject)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the splits that were set using SetSplits
|
||||
/// </summary>
|
||||
|
@ -62,7 +64,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// increasing. Values outside the splits specified will be treated as errors.
|
||||
/// </param>
|
||||
/// <returns>New <see cref="Bucketizer"/> object</returns>
|
||||
public Bucketizer SetSplits(double[] value) =>
|
||||
public Bucketizer SetSplits(double[] value) =>
|
||||
WrapAsBucketizer(Reference.Invoke("setSplits", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -82,7 +84,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// includes y. The splits should be of length >= 3 and strictly increasing.
|
||||
/// Values outside the splits specified will be treated as errors.</param>
|
||||
/// <returns>New <see cref="Bucketizer"/> object</returns>
|
||||
public Bucketizer SetSplitsArray(double[][] value) =>
|
||||
public Bucketizer SetSplitsArray(double[][] value) =>
|
||||
WrapAsBucketizer(Reference.Invoke("setSplitsArray", (object)value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -98,15 +100,15 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The name of the column to use as the source of the buckets</param>
|
||||
/// <returns>New <see cref="Bucketizer"/> object</returns>
|
||||
public Bucketizer SetInputCol(string value) =>
|
||||
public Bucketizer SetInputCol(string value) =>
|
||||
WrapAsBucketizer(Reference.Invoke("setInputCol", value));
|
||||
|
||||
/// <summary>
|
||||
/// Gets the columns that <see cref="Bucketizer"/> should read from and convert into
|
||||
/// buckets. This is set by SetInputCol
|
||||
/// </summary>
|
||||
/// <returns>IEnumerable<string>, list of input columns</returns>
|
||||
public IEnumerable<string> GetInputCols() =>
|
||||
/// <returns>IEnumerable<string>, list of input columns</returns>
|
||||
public IEnumerable<string> GetInputCols() =>
|
||||
((string[])(Reference.Invoke("getInputCols"))).ToList();
|
||||
|
||||
/// <summary>
|
||||
|
@ -118,7 +120,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">List of input columns to use as sources for buckets</param>
|
||||
/// <returns>New <see cref="Bucketizer"/> object</returns>
|
||||
public Bucketizer SetInputCols(IEnumerable<string> value) =>
|
||||
public Bucketizer SetInputCols(IEnumerable<string> value) =>
|
||||
WrapAsBucketizer(Reference.Invoke("setInputCols", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -134,7 +136,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The name of the new column which contains the bucket ID</param>
|
||||
/// <returns>New <see cref="Bucketizer"/> object</returns>
|
||||
public Bucketizer SetOutputCol(string value) =>
|
||||
public Bucketizer SetOutputCol(string value) =>
|
||||
WrapAsBucketizer(Reference.Invoke("setOutputCol", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -142,7 +144,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// This is set by SetOutputCols
|
||||
/// </summary>
|
||||
/// <returns>IEnumerable<string>, list of output columns</returns>
|
||||
public IEnumerable<string> GetOutputCols() =>
|
||||
public IEnumerable<string> GetOutputCols() =>
|
||||
((string[])Reference.Invoke("getOutputCols")).ToList();
|
||||
|
||||
/// <summary>
|
||||
|
@ -150,7 +152,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">List of column names which will contain the bucket ID</param>
|
||||
/// <returns>New <see cref="Bucketizer"/> object</returns>
|
||||
public Bucketizer SetOutputCols(List<string> value) =>
|
||||
public Bucketizer SetOutputCols(List<string> value) =>
|
||||
WrapAsBucketizer(Reference.Invoke("setOutputCols", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -161,7 +163,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
public static Bucketizer Load(string path) =>
|
||||
WrapAsBucketizer(
|
||||
SparkEnvironment.JvmBridge.CallStaticJavaMethod(
|
||||
s_bucketizerClassName,"load", path));
|
||||
s_bucketizerClassName, "load", path));
|
||||
|
||||
/// <summary>
|
||||
/// Executes the <see cref="Bucketizer"/> and transforms the DataFrame to include the new
|
||||
|
@ -171,7 +173,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>
|
||||
/// <see cref="DataFrame"/> containing the original data and the new bucketed columns
|
||||
/// </returns>
|
||||
public DataFrame Transform(DataFrame source) =>
|
||||
public override DataFrame Transform(DataFrame source) =>
|
||||
new DataFrame((JvmObjectReference)Reference.Invoke("transform", source));
|
||||
|
||||
/// <summary>
|
||||
|
@ -188,10 +190,31 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">"skip", "error" or "keep"</param>
|
||||
/// <returns>New <see cref="Bucketizer"/> object</returns>
|
||||
public Bucketizer SetHandleInvalid(string value) =>
|
||||
public Bucketizer SetHandleInvalid(string value) =>
|
||||
WrapAsBucketizer(Reference.Invoke("setHandleInvalid", value.ToString()));
|
||||
|
||||
private static Bucketizer WrapAsBucketizer(object obj) =>
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{Bucketizer}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<Bucketizer> Read() =>
|
||||
new JavaMLReader<Bucketizer>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static Bucketizer WrapAsBucketizer(object obj) =>
|
||||
new Bucketizer((JvmObjectReference)obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,11 +8,14 @@ using Microsoft.Spark.Sql;
|
|||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
public class CountVectorizer : FeatureBase<CountVectorizer>
|
||||
public class CountVectorizer :
|
||||
JavaEstimator<CountVectorizerModel>,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<CountVectorizer>
|
||||
{
|
||||
private static readonly string s_countVectorizerClassName =
|
||||
private static readonly string s_countVectorizerClassName =
|
||||
"org.apache.spark.ml.feature.CountVectorizer";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Creates a <see cref="CountVectorizer"/> without any parameters.
|
||||
/// </summary>
|
||||
|
@ -28,7 +31,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
public CountVectorizer(string uid) : base(s_countVectorizerClassName, uid)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
internal CountVectorizer(JvmObjectReference jvmObject) : base(jvmObject)
|
||||
{
|
||||
}
|
||||
|
@ -36,7 +39,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <summary>Fits a model to the input data.</summary>
|
||||
/// <param name="dataFrame">The <see cref="DataFrame"/> to fit the model to.</param>
|
||||
/// <returns><see cref="CountVectorizerModel"/></returns>
|
||||
public CountVectorizerModel Fit(DataFrame dataFrame) =>
|
||||
public override CountVectorizerModel Fit(DataFrame dataFrame) =>
|
||||
new CountVectorizerModel((JvmObjectReference)Reference.Invoke("fit", dataFrame));
|
||||
|
||||
/// <summary>
|
||||
|
@ -49,8 +52,8 @@ namespace Microsoft.Spark.ML.Feature
|
|||
public static CountVectorizer Load(string path) =>
|
||||
WrapAsCountVectorizer((JvmObjectReference)
|
||||
SparkEnvironment.JvmBridge.CallStaticJavaMethod(
|
||||
s_countVectorizerClassName,"load", path));
|
||||
|
||||
s_countVectorizerClassName, "load", path));
|
||||
|
||||
/// <summary>
|
||||
/// Gets the binary toggle to control the output vector values. If True, all nonzero counts
|
||||
/// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic
|
||||
|
@ -58,7 +61,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <returns>boolean</returns>
|
||||
public bool GetBinary() => (bool)Reference.Invoke("getBinary");
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Sets the binary toggle to control the output vector values. If True, all nonzero counts
|
||||
/// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic
|
||||
|
@ -75,7 +78,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <returns>The input column of type string</returns>
|
||||
public string GetInputCol() => (string)Reference.Invoke("getInputCol");
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Sets the column that the <see cref="CountVectorizer"/> should read from.
|
||||
/// </summary>
|
||||
|
@ -83,14 +86,14 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns><see cref="CountVectorizer"/> with the input column set</returns>
|
||||
public CountVectorizer SetInputCol(string value) =>
|
||||
WrapAsCountVectorizer((JvmObjectReference)Reference.Invoke("setInputCol", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the name of the new column the <see cref="CountVectorizer"/> creates in the
|
||||
/// DataFrame.
|
||||
/// </summary>
|
||||
/// <returns>The name of the output column.</returns>
|
||||
public string GetOutputCol() => (string)Reference.Invoke("getOutputCol");
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Sets the name of the new column the <see cref="CountVectorizer"/> creates in the
|
||||
/// DataFrame.
|
||||
|
@ -99,7 +102,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>New <see cref="CountVectorizer"/> with the output column set</returns>
|
||||
public CountVectorizer SetOutputCol(string value) =>
|
||||
WrapAsCountVectorizer((JvmObjectReference)Reference.Invoke("setOutputCol", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the maximum number of different documents a term could appear in to be included in
|
||||
/// the vocabulary. A term that appears more than the threshold will be ignored. If this is
|
||||
|
@ -123,7 +126,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
[Since(Versions.V2_4_0)]
|
||||
public CountVectorizer SetMaxDF(double value) =>
|
||||
WrapAsCountVectorizer((JvmObjectReference)Reference.Invoke("setMaxDF", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the minimum number of different documents a term must appear in to be included in
|
||||
/// the vocabulary. If this is an integer greater than or equal to 1, this specifies the
|
||||
|
@ -132,7 +135,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <returns>The minimum document term frequency</returns>
|
||||
public double GetMinDF() => (double)Reference.Invoke("getMinDF");
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Sets the minimum number of different documents a term must appear in to be included in
|
||||
/// the vocabulary. If this is an integer greater than or equal to 1, this specifies the
|
||||
|
@ -143,7 +146,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>New <see cref="CountVectorizer"/> with the min df value set</returns>
|
||||
public CountVectorizer SetMinDF(double value) =>
|
||||
WrapAsCountVectorizer((JvmObjectReference)Reference.Invoke("setMinDF", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the filter to ignore rare words in a document. For each document, terms with
|
||||
/// frequency/count less than the given threshold are ignored. If this is an integer
|
||||
|
@ -171,7 +174,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>New <see cref="CountVectorizer"/> with the min term frequency set</returns>
|
||||
public CountVectorizer SetMinTF(double value) =>
|
||||
WrapAsCountVectorizer((JvmObjectReference)Reference.Invoke("setMinTF", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the max size of the vocabulary. <see cref="CountVectorizer"/> will build a
|
||||
/// vocabulary that only considers the top vocabSize terms ordered by term frequency across
|
||||
|
@ -179,7 +182,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <returns>The max size of the vocabulary of type int.</returns>
|
||||
public int GetVocabSize() => (int)Reference.Invoke("getVocabSize");
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Sets the max size of the vocabulary. <see cref="CountVectorizer"/> will build a
|
||||
/// vocabulary that only considers the top vocabSize terms ordered by term frequency across
|
||||
|
@ -187,10 +190,31 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The max vocabulary size</param>
|
||||
/// <returns><see cref="CountVectorizer"/> with the max vocab value set</returns>
|
||||
public CountVectorizer SetVocabSize(int value) =>
|
||||
public CountVectorizer SetVocabSize(int value) =>
|
||||
WrapAsCountVectorizer(Reference.Invoke("setVocabSize", value));
|
||||
|
||||
private static CountVectorizer WrapAsCountVectorizer(object obj) =>
|
||||
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{CountVectorizer}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<CountVectorizer> Read() =>
|
||||
new JavaMLReader<CountVectorizer>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static CountVectorizer WrapAsCountVectorizer(object obj) =>
|
||||
new CountVectorizer((JvmObjectReference)obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,16 +10,19 @@ using Microsoft.Spark.Sql.Types;
|
|||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
public class CountVectorizerModel : FeatureBase<CountVectorizerModel>
|
||||
public class CountVectorizerModel :
|
||||
JavaModel<CountVectorizerModel>,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<CountVectorizerModel>
|
||||
{
|
||||
private static readonly string s_countVectorizerModelClassName =
|
||||
private static readonly string s_countVectorizerModelClassName =
|
||||
"org.apache.spark.ml.feature.CountVectorizerModel";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Creates a <see cref="CountVectorizerModel"/> without any parameters
|
||||
/// </summary>
|
||||
/// <param name="vocabulary">The vocabulary to use</param>
|
||||
public CountVectorizerModel(List<string> vocabulary)
|
||||
public CountVectorizerModel(List<string> vocabulary)
|
||||
: this(SparkEnvironment.JvmBridge.CallConstructor(
|
||||
s_countVectorizerModelClassName, vocabulary))
|
||||
{
|
||||
|
@ -31,16 +34,16 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
|
||||
/// <param name="vocabulary">The vocabulary to use</param>
|
||||
public CountVectorizerModel(string uid, List<string> vocabulary)
|
||||
public CountVectorizerModel(string uid, List<string> vocabulary)
|
||||
: this(SparkEnvironment.JvmBridge.CallConstructor(
|
||||
s_countVectorizerModelClassName, uid, vocabulary))
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
internal CountVectorizerModel(JvmObjectReference jvmObject) : base(jvmObject)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Loads the <see cref="CountVectorizerModel"/> that was previously saved using Save
|
||||
/// </summary>
|
||||
|
@ -52,7 +55,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
WrapAsCountVectorizerModel(
|
||||
SparkEnvironment.JvmBridge.CallStaticJavaMethod(
|
||||
s_countVectorizerModelClassName, "load", path));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the binary toggle to control the output vector values. If True, all nonzero counts
|
||||
/// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic
|
||||
|
@ -79,7 +82,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <returns>string, the input column</returns>
|
||||
public string GetInputCol() => (string)Reference.Invoke("getInputCol");
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Sets the column that the <see cref="CountVectorizerModel"/> should read from.
|
||||
/// </summary>
|
||||
|
@ -87,14 +90,14 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns><see cref="CountVectorizerModel"/> with the input column set</returns>
|
||||
public CountVectorizerModel SetInputCol(string value) =>
|
||||
WrapAsCountVectorizerModel(Reference.Invoke("setInputCol", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the name of the new column the <see cref="CountVectorizerModel"/> will create in
|
||||
/// the DataFrame.
|
||||
/// </summary>
|
||||
/// <returns>The name of the output column.</returns>
|
||||
public string GetOutputCol() => (string)Reference.Invoke("getOutputCol");
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Sets the name of the new column the <see cref="CountVectorizerModel"/> will create in
|
||||
/// the DataFrame.
|
||||
|
@ -103,7 +106,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>New <see cref="CountVectorizerModel"/> with the output column set</returns>
|
||||
public CountVectorizerModel SetOutputCol(string value) =>
|
||||
WrapAsCountVectorizerModel(Reference.Invoke("setOutputCol", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the maximum number of different documents a term could appear in to be included in
|
||||
/// the vocabulary. A term that appears more than the threshold will be ignored. If this is
|
||||
|
@ -113,7 +116,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <returns>The maximum document term frequency of type double.</returns>
|
||||
public double GetMaxDF() => (double)Reference.Invoke("getMaxDF");
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the minimum number of different documents a term must appear in to be included in
|
||||
/// the vocabulary. If this is an integer greater than or equal to 1, this specifies the
|
||||
|
@ -152,7 +155,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </returns>
|
||||
public CountVectorizerModel SetMinTF(double value) =>
|
||||
WrapAsCountVectorizerModel(Reference.Invoke("setMinTF", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the max size of the vocabulary. <see cref="CountVectorizerModel"/> will build a
|
||||
/// vocabulary that only considers the top vocabSize terms ordered by term frequency across
|
||||
|
@ -160,7 +163,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <returns>The max size of the vocabulary of type int.</returns>
|
||||
public int GetVocabSize() => (int)Reference.Invoke("getVocabSize");
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Check transform validity and derive the output schema from the input schema.
|
||||
///
|
||||
|
@ -177,21 +180,42 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// The <see cref="StructType"/> of the output schema that would have been derived from the
|
||||
/// input schema, if Transform had been called.
|
||||
/// </returns>
|
||||
public StructType TransformSchema(StructType value) =>
|
||||
public override StructType TransformSchema(StructType value) =>
|
||||
new StructType(
|
||||
(JvmObjectReference)Reference.Invoke(
|
||||
"transformSchema",
|
||||
"transformSchema",
|
||||
DataType.FromJson(Reference.Jvm, value.Json)));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Converts a DataFrame with a text document to a sparse vector of token counts.
|
||||
/// </summary>
|
||||
/// <param name="document"><see cref="DataFrame"/> to transform</param>
|
||||
/// <returns><see cref="DataFrame"/> containing the original data and the counts</returns>
|
||||
public DataFrame Transform(DataFrame document) =>
|
||||
public override DataFrame Transform(DataFrame document) =>
|
||||
new DataFrame((JvmObjectReference)Reference.Invoke("transform", document));
|
||||
|
||||
private static CountVectorizerModel WrapAsCountVectorizerModel(object obj) =>
|
||||
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{CountVectorizerModel}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<CountVectorizerModel> Read() =>
|
||||
new JavaMLReader<CountVectorizerModel>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static CountVectorizerModel WrapAsCountVectorizerModel(object obj) =>
|
||||
new CountVectorizerModel((JvmObjectReference)obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
/// <summary>
|
||||
/// A helper interface for JavaEstimator, so that when we have an array of JavaEstimators
|
||||
/// with different type params, we can hold all of them with IEstimator&lt;object&gt;.
|
||||
/// </summary>
|
||||
public interface IEstimator<out M>
|
||||
{
|
||||
M Fit(DataFrame dataset);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Abstract Class for estimators that fit models to data.
|
||||
/// </summary>
|
||||
/// <typeparam name="M">The type of the model returned by Fit.</typeparam>
|
||||
public abstract class JavaEstimator<M> : JavaPipelineStage, IEstimator<M> where M : JavaModel<M>
|
||||
{
|
||||
internal JavaEstimator(string className) : base(className)
|
||||
{
|
||||
}
|
||||
|
||||
internal JavaEstimator(string className, string uid) : base(className, uid)
|
||||
{
|
||||
}
|
||||
|
||||
internal JavaEstimator(JvmObjectReference jvmObject) : base(jvmObject)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Fits a model to the input data.
|
||||
/// </summary>
|
||||
/// <param name="dataset">input dataset.</param>
|
||||
/// <returns>fitted model</returns>
|
||||
public virtual M Fit(DataFrame dataset) =>
|
||||
WrapAsType<M>((JvmObjectReference)Reference.Invoke("fit", dataset));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
/// <summary>
|
||||
/// <see cref="JavaEvaluator"/> is the abstract base class for evaluators that compute metrics from predictions.
|
||||
/// </summary>
|
||||
public abstract class JavaEvaluator : Params
|
||||
{
|
||||
internal JavaEvaluator(string className) : base(className)
|
||||
{
|
||||
}
|
||||
|
||||
internal JavaEvaluator(string className, string uid) : base(className, uid)
|
||||
{
|
||||
}
|
||||
|
||||
internal JavaEvaluator(JvmObjectReference jvmObject) : base(jvmObject)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Evaluates model output and returns a scalar metric.
|
||||
/// The value of isLargerBetter specifies whether larger values are better.
|
||||
/// </summary>
|
||||
/// <param name="dataset">a dataset that contains labels/observations and predictions.</param>
|
||||
/// <returns>metric</returns>
|
||||
public virtual double Evaluate(DataFrame dataset) =>
|
||||
(double)Reference.Invoke("evaluate", dataset);
|
||||
|
||||
/// <summary>
|
||||
/// Indicates whether the metric returned by evaluate should be maximized
|
||||
/// (true, default) or minimized (false).
|
||||
/// A given evaluator may support multiple metrics which may be maximized or minimized.
|
||||
/// </summary>
|
||||
/// <returns>true if the metric should be maximized, false if it should be minimized</returns>
|
||||
public bool IsLargerBetter =>
|
||||
(bool)Reference.Invoke("isLargerBetter");
|
||||
}
|
||||
}
|
|
@ -3,7 +3,6 @@
|
|||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Microsoft.Spark.Interop;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Microsoft.Spark.Sql;
|
||||
|
@ -11,11 +10,14 @@ using Microsoft.Spark.Sql.Types;
|
|||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
public class FeatureHasher: FeatureBase<FeatureHasher>
|
||||
public class FeatureHasher :
|
||||
JavaTransformer,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<FeatureHasher>
|
||||
{
|
||||
private static readonly string s_featureHasherClassName =
|
||||
private static readonly string s_featureHasherClassName =
|
||||
"org.apache.spark.ml.feature.FeatureHasher";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Creates a <see cref="FeatureHasher"/> without any parameters.
|
||||
/// </summary>
|
||||
|
@ -35,7 +37,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
internal FeatureHasher(JvmObjectReference jvmObject) : base(jvmObject)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Loads the <see cref="FeatureHasher"/> that was previously saved using Save.
|
||||
/// </summary>
|
||||
|
@ -49,22 +51,22 @@ namespace Microsoft.Spark.ML.Feature
|
|||
s_featureHasherClassName,
|
||||
"load",
|
||||
path));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets a list of the columns which have been specified as categorical columns.
|
||||
/// </summary>
|
||||
/// <returns>List of categorical columns, set by SetCategoricalCols</returns>
|
||||
public IEnumerable<string> GetCategoricalCols() =>
|
||||
public IEnumerable<string> GetCategoricalCols() =>
|
||||
(string[])Reference.Invoke("getCategoricalCols");
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Marks columns as categorical columns.
|
||||
/// </summary>
|
||||
/// <param name="value">List of column names to mark as categorical columns</param>
|
||||
/// <returns>New <see cref="FeatureHasher"/> object</returns>
|
||||
public FeatureHasher SetCategoricalCols(IEnumerable<string> value) =>
|
||||
public FeatureHasher SetCategoricalCols(IEnumerable<string> value) =>
|
||||
WrapAsFeatureHasher(Reference.Invoke("setCategoricalCols", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the columns that the <see cref="FeatureHasher"/> should read from and convert into
|
||||
/// hashes. This would have been set by SetInputCol.
|
||||
|
@ -78,9 +80,9 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The names of the columns to use as the source of the hash</param>
|
||||
/// <returns>New <see cref="FeatureHasher"/> object</returns>
|
||||
public FeatureHasher SetInputCols(IEnumerable<string> value) =>
|
||||
public FeatureHasher SetInputCols(IEnumerable<string> value) =>
|
||||
WrapAsFeatureHasher(Reference.Invoke("setInputCols", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the number of features that should be used. Since a simple modulo is used to
|
||||
/// transform the hash function to a column index, it is advisable to use a power of two
|
||||
|
@ -98,9 +100,9 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">int value of number of features</param>
|
||||
/// <returns>New <see cref="FeatureHasher"/> object</returns>
|
||||
public FeatureHasher SetNumFeatures(int value) =>
|
||||
public FeatureHasher SetNumFeatures(int value) =>
|
||||
WrapAsFeatureHasher(Reference.Invoke("setNumFeatures", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the name of the column the output data will be written to. This is set by
|
||||
/// SetOutputCol.
|
||||
|
@ -113,18 +115,18 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The name of the new column which will contain the hash</param>
|
||||
/// <returns>New <see cref="FeatureHasher"/> object</returns>
|
||||
public FeatureHasher SetOutputCol(string value) =>
|
||||
public FeatureHasher SetOutputCol(string value) =>
|
||||
WrapAsFeatureHasher(Reference.Invoke("setOutputCol", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Transforms the input <see cref="DataFrame"/>. It is recommended that you validate that
|
||||
/// the transform will succeed by calling TransformSchema.
|
||||
/// </summary>
|
||||
/// <param name="value">Input <see cref="DataFrame"/> to transform</param>
|
||||
/// <returns>Transformed <see cref="DataFrame"/></returns>
|
||||
public DataFrame Transform(DataFrame value) =>
|
||||
public override DataFrame Transform(DataFrame value) =>
|
||||
new DataFrame((JvmObjectReference)Reference.Invoke("transform", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Check transform validity and derive the output schema from the input schema.
|
||||
///
|
||||
|
@ -141,13 +143,34 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// The <see cref="StructType"/> of the output schema that would have been derived from the
|
||||
/// input schema, if Transform had been called.
|
||||
/// </returns>
|
||||
public StructType TransformSchema(StructType value) =>
|
||||
public override StructType TransformSchema(StructType value) =>
|
||||
new StructType(
|
||||
(JvmObjectReference)Reference.Invoke(
|
||||
"transformSchema",
|
||||
"transformSchema",
|
||||
DataType.FromJson(Reference.Jvm, value.Json)));
|
||||
|
||||
private static FeatureHasher WrapAsFeatureHasher(object obj) =>
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{FeatureHasher}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<FeatureHasher> Read() =>
|
||||
new JavaMLReader<FeatureHasher>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static FeatureHasher WrapAsFeatureHasher(object obj) =>
|
||||
new FeatureHasher((JvmObjectReference)obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,12 +2,9 @@
|
|||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Microsoft.Spark.Interop;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Sql.Types;
|
||||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
|
@ -19,9 +16,12 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// power of two as the numFeatures parameter; otherwise the features will not be mapped evenly
|
||||
/// to the columns.
|
||||
/// </summary>
|
||||
public class HashingTF : FeatureBase<HashingTF>
|
||||
public class HashingTF :
|
||||
JavaTransformer,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<HashingTF>
|
||||
{
|
||||
private static readonly string s_hashingTfClassName =
|
||||
private static readonly string s_hashingTfClassName =
|
||||
"org.apache.spark.ml.feature.HashingTF";
|
||||
|
||||
/// <summary>
|
||||
|
@ -39,7 +39,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
public HashingTF(string uid) : base(s_hashingTfClassName, uid)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
internal HashingTF(JvmObjectReference jvmObject) : base(jvmObject)
|
||||
{
|
||||
}
|
||||
|
@ -66,7 +66,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// models that model binary events rather than integer counts
|
||||
///</summary>
|
||||
/// <param name="value">binary toggle, default is false</param>
|
||||
public HashingTF SetBinary(bool value) =>
|
||||
public HashingTF SetBinary(bool value) =>
|
||||
WrapAsHashingTF(Reference.Invoke("setBinary", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -80,7 +80,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The name of the column to use as the source</param>
|
||||
/// <returns>New <see cref="HashingTF"/> object</returns>
|
||||
public HashingTF SetInputCol(string value) =>
|
||||
public HashingTF SetInputCol(string value) =>
|
||||
WrapAsHashingTF(Reference.Invoke("setInputCol", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -96,7 +96,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The name of the new column</param>
|
||||
/// <returns>New <see cref="HashingTF"/> object</returns>
|
||||
public HashingTF SetOutputCol(string value) =>
|
||||
public HashingTF SetOutputCol(string value) =>
|
||||
WrapAsHashingTF(Reference.Invoke("setOutputCol", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -116,7 +116,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">int</param>
|
||||
/// <returns>New <see cref="HashingTF"/> object</returns>
|
||||
public HashingTF SetNumFeatures(int value) =>
|
||||
public HashingTF SetNumFeatures(int value) =>
|
||||
WrapAsHashingTF(Reference.Invoke("setNumFeatures", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -125,10 +125,31 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="source">The <see cref="DataFrame"/> to add the tokens to</param>
|
||||
/// <returns><see cref="DataFrame"/> containing the original data and the tokens</returns>
|
||||
public DataFrame Transform(DataFrame source) =>
|
||||
public override DataFrame Transform(DataFrame source) =>
|
||||
new DataFrame((JvmObjectReference)Reference.Invoke("transform", source));
|
||||
|
||||
private static HashingTF WrapAsHashingTF(object obj) =>
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{HashingTF}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<HashingTF> Read() =>
|
||||
new JavaMLReader<HashingTF>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static HashingTF WrapAsHashingTF(object obj) =>
|
||||
new HashingTF((JvmObjectReference)obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,10 +17,13 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// of documents (controlled by the variable minDocFreq). For terms that are not in at least
|
||||
/// minDocFreq documents, the IDF is found as 0, resulting in TF-IDFs of 0.
|
||||
/// </summary>
|
||||
public class IDF : FeatureBase<IDF>
|
||||
public class IDF :
|
||||
JavaEstimator<IDFModel>,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<IDF>
|
||||
{
|
||||
private static readonly string s_IDFClassName = "org.apache.spark.ml.feature.IDF";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Create a <see cref="IDF"/> without any parameters
|
||||
/// </summary>
|
||||
|
@ -36,11 +39,11 @@ namespace Microsoft.Spark.ML.Feature
|
|||
public IDF(string uid) : base(s_IDFClassName, uid)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
internal IDF(JvmObjectReference jvmObject) : base(jvmObject)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the column that the <see cref="IDF"/> should read from
|
||||
/// </summary>
|
||||
|
@ -67,7 +70,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The name of the new column</param>
|
||||
/// <returns>New <see cref="IDF"/> object</returns>
|
||||
public IDF SetOutputCol(string value) =>
|
||||
public IDF SetOutputCol(string value) =>
|
||||
WrapAsIDF(Reference.Invoke("setOutputCol", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -81,7 +84,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">int, the minimum of documents a term should appear in</param>
|
||||
/// <returns>New <see cref="IDF"/> object</returns>
|
||||
public IDF SetMinDocFreq(int value) =>
|
||||
public IDF SetMinDocFreq(int value) =>
|
||||
WrapAsIDF(Reference.Invoke("setMinDocFreq", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -89,7 +92,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="source">The <see cref="DataFrame"/> to fit the model to</param>
|
||||
/// <returns>New <see cref="IDFModel"/> object</returns>
|
||||
public IDFModel Fit(DataFrame source) =>
|
||||
public override IDFModel Fit(DataFrame source) =>
|
||||
new IDFModel((JvmObjectReference)Reference.Invoke("fit", source));
|
||||
|
||||
/// <summary>
|
||||
|
@ -102,7 +105,28 @@ namespace Microsoft.Spark.ML.Feature
|
|||
return WrapAsIDF(
|
||||
SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_IDFClassName, "load", path));
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{IDF}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<IDF> Read() =>
|
||||
new JavaMLReader<IDF>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static IDF WrapAsIDF(object obj) => new IDF((JvmObjectReference)obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,11 +12,14 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// A <see cref="IDFModel"/> that converts the input string to lowercase and then splits it by
|
||||
/// white spaces.
|
||||
/// </summary>
|
||||
public class IDFModel : FeatureBase<IDFModel>
|
||||
public class IDFModel :
|
||||
JavaModel<IDFModel>,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<IDFModel>
|
||||
{
|
||||
private static readonly string s_IDFModelClassName =
|
||||
private static readonly string s_IDFModelClassName =
|
||||
"org.apache.spark.ml.feature.IDFModel";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Create a <see cref="IDFModel"/> without any parameters
|
||||
/// </summary>
|
||||
|
@ -32,11 +35,11 @@ namespace Microsoft.Spark.ML.Feature
|
|||
public IDFModel(string uid) : base(s_IDFModelClassName, uid)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
internal IDFModel(JvmObjectReference jvmObject) : base(jvmObject)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the column that the <see cref="IDFModel"/> should read from
|
||||
/// </summary>
|
||||
|
@ -49,7 +52,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The name of the column to use as the source</param>
|
||||
/// <returns>New <see cref="IDFModel"/> object</returns>
|
||||
public IDFModel SetInputCol(string value) =>
|
||||
public IDFModel SetInputCol(string value) =>
|
||||
WrapAsIDFModel(Reference.Invoke("setInputCol", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -66,7 +69,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <param name="value">The name of the new column which contains the tokens
|
||||
/// </param>
|
||||
/// <returns>New <see cref="IDFModel"/> object</returns>
|
||||
public IDFModel SetOutputCol(string value) =>
|
||||
public IDFModel SetOutputCol(string value) =>
|
||||
WrapAsIDFModel(Reference.Invoke("setOutputCol", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -81,7 +84,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="source">The <see cref="DataFrame"/> to add the tokens to</param>
|
||||
/// <returns><see cref="DataFrame"/> containing the original data and the tokens</returns>
|
||||
public DataFrame Transform(DataFrame source) =>
|
||||
public override DataFrame Transform(DataFrame source) =>
|
||||
new DataFrame((JvmObjectReference)Reference.Invoke("transform", source));
|
||||
|
||||
/// <summary>
|
||||
|
@ -96,7 +99,28 @@ namespace Microsoft.Spark.ML.Feature
|
|||
s_IDFModelClassName, "load", path));
|
||||
}
|
||||
|
||||
private static IDFModel WrapAsIDFModel(object obj) =>
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{IDFModel}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<IDFModel> Read() =>
|
||||
new JavaMLReader<IDFModel>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static IDFModel WrapAsIDFModel(object obj) =>
|
||||
new IDFModel((JvmObjectReference)obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
/// <summary>
|
||||
/// A helper interface for JavaModel, so that when we have an array of JavaModels
|
||||
/// with different type params, we can hold all of them with IModel&lt;object&gt;.
|
||||
/// </summary>
|
||||
/// <typeparam name="M">Model type; declared covariant (<c>out</c>).</typeparam>
public interface IModel<out M>
|
||||
{
|
||||
/// <summary>Indicates whether this model has a corresponding parent.</summary>
/// <returns>bool</returns>
bool HasParent();
|
||||
}
|
||||
|
||||
/// <summary>
/// Base class for fitted models, i.e. a <see cref="JavaTransformer"/> that was produced
/// by an estimator.
/// </summary>
/// <typeparam name="M">
/// The concrete model type deriving from this class.
/// </typeparam>
public abstract class JavaModel<M> : JavaTransformer, IModel<M> where M : JavaModel<M>
{
    internal JavaModel(string className) : base(className)
    {
    }

    internal JavaModel(string className, string uid) : base(className, uid)
    {
    }

    internal JavaModel(JvmObjectReference jvmObject) : base(jvmObject)
    {
    }

    /// <summary>
    /// Assigns the estimator that acts as the parent of this model.
    /// </summary>
    /// <param name="parent">The estimator to register as this model's parent.</param>
    /// <returns>
    /// The reference returned by the JVM-side <c>setParent</c> call, wrapped as
    /// <typeparamref name="M"/>.
    /// </returns>
    public M SetParent(JavaEstimator<M> parent)
    {
        JvmObjectReference updatedModel =
            (JvmObjectReference)Reference.Invoke("setParent", parent);
        return WrapAsType<M>(updatedModel);
    }

    /// <summary>
    /// Reports whether a parent has been associated with this model.
    /// </summary>
    /// <returns>The boolean returned by the JVM-side <c>hasParent</c> call.</returns>
    public bool HasParent()
    {
        return (bool)Reference.Invoke("hasParent");
    }
}
|
||||
}
|
|
@ -14,7 +14,10 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// an array of n-grams. Null values in the input array are ignored. It returns an array
|
||||
/// of n-grams where each n-gram is represented by a space-separated string of words.
|
||||
/// </summary>
|
||||
public class NGram : FeatureBase<NGram>
|
||||
public class NGram :
|
||||
JavaTransformer,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<NGram>
|
||||
{
|
||||
private static readonly string s_nGramClassName =
|
||||
"org.apache.spark.ml.feature.NGram";
|
||||
|
@ -87,7 +90,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>
|
||||
/// New <see cref="DataFrame"/> object with the source <see cref="DataFrame"/> transformed.
|
||||
/// </returns>
|
||||
public DataFrame Transform(DataFrame source) =>
|
||||
public override DataFrame Transform(DataFrame source) =>
|
||||
new DataFrame((JvmObjectReference)Reference.Invoke("transform", source));
|
||||
|
||||
/// <summary>
|
||||
|
@ -106,7 +109,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// The <see cref="StructType"/> of the output schema that would have been derived from the
|
||||
/// input schema, if Transform had been called.
|
||||
/// </returns>
|
||||
public StructType TransformSchema(StructType value) =>
|
||||
public override StructType TransformSchema(StructType value) =>
|
||||
new StructType(
|
||||
(JvmObjectReference)Reference.Invoke(
|
||||
"transformSchema",
|
||||
|
@ -124,6 +127,27 @@ namespace Microsoft.Spark.ML.Feature
|
|||
"load",
|
||||
path));
|
||||
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{NGram}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<NGram> Read() =>
|
||||
new JavaMLReader<NGram>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static NGram WrapAsNGram(object obj) => new NGram((JvmObjectReference)obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,120 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Reflection;
|
||||
using Microsoft.Spark.Interop;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Microsoft.Spark.Sql;
|
||||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
/// <summary>
/// A simple pipeline, which acts as an estimator.
/// A Pipeline consists of a sequence of stages, each of which is either an Estimator or a
/// Transformer. When Pipeline.fit is called, the stages are executed in order. If a stage is
/// an Estimator, its Estimator.fit method will be called on the input dataset to fit a model.
/// Then the model, which is a transformer, will be used to transform the dataset as the input
/// to the next stage. If a stage is a Transformer, its Transformer.transform method will be
/// called to produce the dataset for the next stage. The fitted model from a Pipeline is a
/// PipelineModel, which consists of fitted models and transformers, corresponding to the
/// pipeline stages. If there are no stages, the pipeline acts as an identity transformer.
/// </summary>
public class Pipeline :
    JavaEstimator<PipelineModel>,
    IJavaMLWritable,
    IJavaMLReadable<Pipeline>
{
    private static readonly string s_pipelineClassName = "org.apache.spark.ml.Pipeline";

    /// <summary>
    /// Creates a <see cref="Pipeline"/> without any parameters.
    /// </summary>
    public Pipeline() : base(s_pipelineClassName)
    {
    }

    /// <summary>
    /// Creates a <see cref="Pipeline"/> with a UID that is used to give the
    /// <see cref="Pipeline"/> a unique ID.
    /// </summary>
    /// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
    public Pipeline(string uid) : base(s_pipelineClassName, uid)
    {
    }

    internal Pipeline(JvmObjectReference jvmObject) : base(jvmObject)
    {
    }

    /// <summary>
    /// Set the stages of pipeline instance.
    /// </summary>
    /// <param name="value">
    /// A sequence of stages, each of which is either an Estimator or a Transformer.
    /// </param>
    /// <returns><see cref="Pipeline"/> object</returns>
    public Pipeline SetStages(JavaPipelineStage[] value) =>
        WrapAsPipeline((JvmObjectReference)SparkEnvironment.JvmBridge.CallStaticJavaMethod(
            "org.apache.spark.mllib.api.dotnet.MLUtils", "setPipelineStages",
            Reference, value.ToJavaArrayList()));

    /// <summary>
    /// Get the stages of pipeline instance.
    /// </summary>
    /// <remarks>
    /// For every JVM stage, <c>DotnetUtils.GetUnderlyingType</c> supplies a .NET type name
    /// and the name of a non-public static method on that type; the method is invoked via
    /// reflection with the JVM reference to build the .NET wrapper.
    /// </remarks>
    /// <returns>A sequence of <see cref="JavaPipelineStage"/> stages</returns>
    /// <exception cref="InvalidOperationException">
    /// The .NET type or its wrapping method for a stage cannot be resolved.
    /// </exception>
    public JavaPipelineStage[] GetStages()
    {
        JvmObjectReference[] jvmObjects = (JvmObjectReference[])Reference.Invoke("getStages");
        JavaPipelineStage[] result = new JavaPipelineStage[jvmObjects.Length];
        for (int i = 0; i < jvmObjects.Length; i++)
        {
            (string constructorClass, string methodName) =
                DotnetUtils.GetUnderlyingType(jvmObjects[i]);

            // Type.GetType and Type.GetMethod both return null on a miss; fail with context
            // instead of letting the next dereference raise a NullReferenceException.
            Type type = Type.GetType(constructorClass) ??
                throw new InvalidOperationException(
                    $"Cannot resolve .NET type '{constructorClass}' for pipeline stage {i}.");
            MethodInfo method =
                type.GetMethod(methodName, BindingFlags.NonPublic | BindingFlags.Static) ??
                throw new InvalidOperationException(
                    $"Type '{constructorClass}' has no non-public static method " +
                    $"'{methodName}' to wrap pipeline stage {i}.");

            result[i] = (JavaPipelineStage)method.Invoke(null, new object[] { jvmObjects[i] });
        }

        return result;
    }

    /// <summary>Fits a model to the input data.</summary>
    /// <param name="dataset">The <see cref="DataFrame"/> to fit the model to.</param>
    /// <returns><see cref="PipelineModel"/></returns>
    public override PipelineModel Fit(DataFrame dataset) =>
        new PipelineModel(
            (JvmObjectReference)Reference.Invoke("fit", dataset));

    /// <summary>
    /// Loads the <see cref="Pipeline"/> that was previously saved using Save(string).
    /// </summary>
    /// <param name="path">The path the previous <see cref="Pipeline"/> was saved to</param>
    /// <returns>New <see cref="Pipeline"/> object, loaded from path.</returns>
    public static Pipeline Load(string path) => WrapAsPipeline(
        SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_pipelineClassName, "load", path));

    /// <summary>
    /// Saves the object so that it can be loaded later using Load. Note that these objects
    /// can be shared with Scala by Loading or Saving in Scala.
    /// </summary>
    /// <param name="path">The path to save the object to</param>
    public void Save(string path) => Reference.Invoke("save", path);

    /// <summary>
    /// Get the corresponding JavaMLWriter instance.
    /// </summary>
    /// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
    public JavaMLWriter Write() =>
        new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));

    /// <summary>
    /// Get the corresponding JavaMLReader instance.
    /// </summary>
    /// <returns>a <see cref="JavaMLReader{Pipeline}"/> instance for this ML instance.</returns>
    public JavaMLReader<Pipeline> Read() =>
        new JavaMLReader<Pipeline>((JvmObjectReference)Reference.Invoke("read"));

    /// <summary>
    /// Wraps a JVM object reference in a new <see cref="Pipeline"/>.
    /// </summary>
    private static Pipeline WrapAsPipeline(object obj) =>
        new Pipeline((JvmObjectReference)obj);
}
|
||||
}
|
|
@ -0,0 +1,70 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using Microsoft.Spark.Interop;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
/// <summary>
/// Represents a fitted pipeline.
/// </summary>
public class PipelineModel :
    JavaModel<PipelineModel>,
    IJavaMLWritable,
    IJavaMLReadable<PipelineModel>
{
    private static readonly string s_pipelineModelClassName = "org.apache.spark.ml.PipelineModel";

    /// <summary>
    /// Builds a <see cref="PipelineModel"/> from a unique ID and the transformers that
    /// make up its stages.
    /// </summary>
    /// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
    /// <param name="stages">Stages for the PipelineModel.</param>
    public PipelineModel(string uid, JavaTransformer[] stages)
        : this(SparkEnvironment.JvmBridge.CallConstructor(
            s_pipelineModelClassName, uid, stages.ToJavaArrayList()))
    {
    }

    internal PipelineModel(JvmObjectReference jvmObject) : base(jvmObject)
    {
    }

    /// <summary>
    /// Loads a <see cref="PipelineModel"/> that was previously written with Save(string).
    /// </summary>
    /// <param name="path">The path the previous <see cref="PipelineModel"/> was saved to</param>
    /// <returns>New <see cref="PipelineModel"/> object, loaded from path.</returns>
    public static PipelineModel Load(string path)
    {
        object loaded = SparkEnvironment.JvmBridge.CallStaticJavaMethod(
            s_pipelineModelClassName, "load", path);
        return WrapAsPipelineModel(loaded);
    }

    /// <summary>
    /// Saves the object so that it can be loaded later using Load. Note that these objects
    /// can be shared with Scala by Loading or Saving in Scala.
    /// </summary>
    /// <param name="path">The path to save the object to</param>
    public void Save(string path)
    {
        Reference.Invoke("save", path);
    }

    /// <summary>
    /// Get the corresponding JavaMLWriter instance.
    /// </summary>
    /// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
    public JavaMLWriter Write()
    {
        JvmObjectReference writer = (JvmObjectReference)Reference.Invoke("write");
        return new JavaMLWriter(writer);
    }

    /// <summary>
    /// Get the corresponding JavaMLReader instance.
    /// </summary>
    /// <returns>
    /// a <see cref="JavaMLReader{PipelineModel}"/> instance for this ML instance.
    /// </returns>
    public JavaMLReader<PipelineModel> Read()
    {
        JvmObjectReference reader = (JvmObjectReference)Reference.Invoke("read");
        return new JavaMLReader<PipelineModel>(reader);
    }

    /// <summary>
    /// Wraps a JVM object reference in a new <see cref="PipelineModel"/>.
    /// </summary>
    private static PipelineModel WrapAsPipelineModel(object obj)
    {
        return new PipelineModel((JvmObjectReference)obj);
    }
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Sql.Types;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
/// <summary>
/// A stage in a pipeline, either an Estimator or a Transformer.
/// </summary>
public abstract class JavaPipelineStage : Params
{
    internal JavaPipelineStage(string className) : base(className)
    {
    }

    internal JavaPipelineStage(string className, string uid) : base(className, uid)
    {
    }

    internal JavaPipelineStage(JvmObjectReference jvmObject) : base(jvmObject)
    {
    }

    /// <summary>
    /// Check transform validity and derive the output schema from the input schema.
    ///
    /// We check validity for interactions between parameters during transformSchema
    /// and raise an exception if any parameter value is invalid.
    ///
    /// Typical implementation should first conduct verification on schema change and
    /// parameter validity, including complex parameter interaction checks.
    /// </summary>
    /// <remarks>
    /// The input schema is handed to the JVM via its JSON form
    /// (<c>DataType.FromJson</c>) before <c>transformSchema</c> is invoked.
    /// </remarks>
    /// <param name="schema">
    /// The <see cref="StructType"/> of the <see cref="DataFrame"/> which will be transformed.
    /// </param>
    /// <returns>
    /// The <see cref="StructType"/> of the output schema that would have been derived from the
    /// input schema, if Transform had been called.
    /// </returns>
    public virtual StructType TransformSchema(StructType schema)
    {
        JvmObjectReference outputSchema = (JvmObjectReference)Reference.Invoke(
            "transformSchema",
            DataType.FromJson(Reference.Jvm, schema.Json));
        return new StructType(outputSchema);
    }
}
|
||||
}
|
|
@ -12,9 +12,12 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <summary>
|
||||
/// <see cref="SQLTransformer"/> implements the transformations which are defined by SQL statement.
|
||||
/// </summary>
|
||||
public class SQLTransformer : FeatureBase<SQLTransformer>
|
||||
public class SQLTransformer :
|
||||
JavaTransformer,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<SQLTransformer>
|
||||
{
|
||||
private static readonly string s_sqlTransformerClassName =
|
||||
private static readonly string s_sqlTransformerClassName =
|
||||
"org.apache.spark.ml.feature.SQLTransformer";
|
||||
|
||||
/// <summary>
|
||||
|
@ -45,7 +48,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>
|
||||
/// New <see cref="DataFrame"/> object with the source <see cref="DataFrame"/> transformed.
|
||||
/// </returns>
|
||||
public DataFrame Transform(DataFrame source) =>
|
||||
public override DataFrame Transform(DataFrame source) =>
|
||||
new DataFrame((JvmObjectReference)Reference.Invoke("transform", source));
|
||||
|
||||
/// <summary>
|
||||
|
@ -55,7 +58,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>
|
||||
/// New <see cref="StructType"/> object with the schema <see cref="StructType"/> transformed.
|
||||
/// </returns>
|
||||
public StructType TransformSchema(StructType value) =>
|
||||
public override StructType TransformSchema(StructType value) =>
|
||||
new StructType(
|
||||
(JvmObjectReference)Reference.Invoke(
|
||||
"transformSchema",
|
||||
|
@ -82,13 +85,34 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="path">The path the previous <see cref="SQLTransformer"/> was saved to</param>
|
||||
/// <returns>New <see cref="SQLTransformer"/> object, loaded from path</returns>
|
||||
public static SQLTransformer Load(string path) =>
|
||||
public static SQLTransformer Load(string path) =>
|
||||
WrapAsSQLTransformer(
|
||||
SparkEnvironment.JvmBridge.CallStaticJavaMethod(
|
||||
s_sqlTransformerClassName,
|
||||
"load",
|
||||
s_sqlTransformerClassName,
|
||||
"load",
|
||||
path));
|
||||
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{SQLTransformer}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<SQLTransformer> Read() =>
|
||||
new JavaMLReader<SQLTransformer>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static SQLTransformer WrapAsSQLTransformer(object obj) =>
|
||||
new SQLTransformer((JvmObjectReference)obj);
|
||||
}
|
||||
|
|
|
@ -13,7 +13,10 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <summary>
|
||||
/// A <see cref="StopWordsRemover"/> feature transformer that filters out stop words from input.
|
||||
/// </summary>
|
||||
public class StopWordsRemover : FeatureBase<StopWordsRemover>
|
||||
public class StopWordsRemover :
|
||||
JavaTransformer,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<StopWordsRemover>
|
||||
{
|
||||
private static readonly string s_stopWordsRemoverClassName =
|
||||
"org.apache.spark.ml.feature.StopWordsRemover";
|
||||
|
@ -63,7 +66,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>
|
||||
/// New <see cref="DataFrame"/> object with the source <see cref="DataFrame"/> transformed
|
||||
/// </returns>
|
||||
public DataFrame Transform(DataFrame source) =>
|
||||
public override DataFrame Transform(DataFrame source) =>
|
||||
new DataFrame((JvmObjectReference)Reference.Invoke("transform", source));
|
||||
|
||||
/// <summary>
|
||||
|
@ -141,7 +144,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// The <see cref="StructType"/> of the output schema that would have been derived from the
|
||||
/// input schema, if Transform had been called.
|
||||
/// </returns>
|
||||
public StructType TransformSchema(StructType value) =>
|
||||
public override StructType TransformSchema(StructType value) =>
|
||||
new StructType(
|
||||
(JvmObjectReference)Reference.Invoke(
|
||||
"transformSchema",
|
||||
|
@ -168,6 +171,27 @@ namespace Microsoft.Spark.ML.Feature
|
|||
WrapAsStopWordsRemover(
|
||||
SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_stopWordsRemoverClassName, "load", path));
|
||||
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{StopWordsRemover}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<StopWordsRemover> Read() =>
|
||||
new JavaMLReader<StopWordsRemover>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static StopWordsRemover WrapAsStopWordsRemover(object obj) =>
|
||||
new StopWordsRemover((JvmObjectReference)obj);
|
||||
}
|
||||
|
|
|
@ -12,11 +12,14 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// A <see cref="Tokenizer"/> that converts the input string to lowercase and then splits it by
|
||||
/// white spaces.
|
||||
/// </summary>
|
||||
public class Tokenizer : FeatureBase<Tokenizer>
|
||||
public class Tokenizer :
|
||||
JavaTransformer,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<Tokenizer>
|
||||
{
|
||||
private static readonly string s_tokenizerClassName =
|
||||
private static readonly string s_tokenizerClassName =
|
||||
"org.apache.spark.ml.feature.Tokenizer";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Create a <see cref="Tokenizer"/> without any parameters
|
||||
/// </summary>
|
||||
|
@ -32,11 +35,11 @@ namespace Microsoft.Spark.ML.Feature
|
|||
public Tokenizer(string uid) : base(s_tokenizerClassName, uid)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
internal Tokenizer(JvmObjectReference jvmObject) : base(jvmObject)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the column that the <see cref="Tokenizer"/> should read from
|
||||
/// </summary>
|
||||
|
@ -48,7 +51,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The name of the column to use as the source</param>
|
||||
/// <returns>New <see cref="Tokenizer"/> object</returns>
|
||||
public Tokenizer SetInputCol(string value) =>
|
||||
public Tokenizer SetInputCol(string value) =>
|
||||
WrapAsTokenizer(Reference.Invoke("setInputCol", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -64,7 +67,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The name of the new column</param>
|
||||
/// <returns>New <see cref="Tokenizer"/> object</returns>
|
||||
public Tokenizer SetOutputCol(string value) =>
|
||||
public Tokenizer SetOutputCol(string value) =>
|
||||
WrapAsTokenizer(Reference.Invoke("setOutputCol", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -75,7 +78,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>
|
||||
/// New <see cref="DataFrame"/> object with the source <see cref="DataFrame"/> transformed
|
||||
/// </returns>
|
||||
public DataFrame Transform(DataFrame source) =>
|
||||
public override DataFrame Transform(DataFrame source) =>
|
||||
new DataFrame((JvmObjectReference)Reference.Invoke("transform", source));
|
||||
|
||||
/// <summary>
|
||||
|
@ -89,8 +92,29 @@ namespace Microsoft.Spark.ML.Feature
|
|||
SparkEnvironment.JvmBridge.CallStaticJavaMethod(
|
||||
s_tokenizerClassName, "load", path));
|
||||
}
|
||||
|
||||
private static Tokenizer WrapAsTokenizer(object obj) =>
|
||||
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{Tokenizer}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<Tokenizer> Read() =>
|
||||
new JavaMLReader<Tokenizer>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static Tokenizer WrapAsTokenizer(object obj) =>
|
||||
new Tokenizer((JvmObjectReference)obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
/// <summary>
/// Abstract class for transformers that transform one dataset into another.
/// </summary>
public abstract class JavaTransformer : JavaPipelineStage
{
    internal JavaTransformer(string className) : base(className)
    {
    }

    internal JavaTransformer(string className, string uid) : base(className, uid)
    {
    }

    internal JavaTransformer(JvmObjectReference jvmObject) : base(jvmObject)
    {
    }

    /// <summary>
    /// Runs the transformer on the given DataFrame so that it includes new columns.
    /// </summary>
    /// <param name="dataset">The Dataframe to be transformed.</param>
    /// <returns>
    /// <see cref="DataFrame"/> containing the original data and new columns.
    /// </returns>
    public virtual DataFrame Transform(DataFrame dataset)
    {
        JvmObjectReference transformed =
            (JvmObjectReference)Reference.Invoke("transform", dataset);
        return new DataFrame(transformed);
    }
}
|
||||
}
|
|
@ -8,9 +8,12 @@ using Microsoft.Spark.Sql;
|
|||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
public class Word2Vec : FeatureBase<Word2Vec>
|
||||
public class Word2Vec :
|
||||
JavaEstimator<Word2VecModel>,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<Word2Vec>
|
||||
{
|
||||
private static readonly string s_word2VecClassName =
|
||||
private static readonly string s_word2VecClassName =
|
||||
"org.apache.spark.ml.feature.Word2Vec";
|
||||
|
||||
/// <summary>
|
||||
|
@ -28,11 +31,11 @@ namespace Microsoft.Spark.ML.Feature
|
|||
public Word2Vec(string uid) : base(s_word2VecClassName, uid)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
internal Word2Vec(JvmObjectReference jvmObject) : base(jvmObject)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the column that the <see cref="Word2Vec"/> should read from.
|
||||
/// </summary>
|
||||
|
@ -44,7 +47,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The name of the column to use as the source.</param>
|
||||
/// <returns><see cref="Word2Vec"/></returns>
|
||||
public Word2Vec SetInputCol(string value) =>
|
||||
public Word2Vec SetInputCol(string value) =>
|
||||
WrapAsWord2Vec(Reference.Invoke("setInputCol", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -60,9 +63,9 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// </summary>
|
||||
/// <param name="value">The name of the output column which will be created.</param>
|
||||
/// <returns>New <see cref="Word2Vec"/></returns>
|
||||
public Word2Vec SetOutputCol(string value) =>
|
||||
public Word2Vec SetOutputCol(string value) =>
|
||||
WrapAsWord2Vec(Reference.Invoke("setOutputCol", value));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Gets the vector size, the dimension of the code that you want to transform from words.
|
||||
/// </summary>
|
||||
|
@ -70,7 +73,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// The vector size, the dimension of the code that you want to transform from words.
|
||||
/// </returns>
|
||||
public int GetVectorSize() => (int)(Reference.Invoke("getVectorSize"));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Sets the vector size, the dimension of the code that you want to transform from words.
|
||||
/// </summary>
|
||||
|
@ -78,7 +81,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// The dimension of the code that you want to transform from words.
|
||||
/// </param>
|
||||
/// <returns><see cref="Word2Vec"/></returns>
|
||||
public Word2Vec SetVectorSize(int value) =>
|
||||
public Word2Vec SetVectorSize(int value) =>
|
||||
WrapAsWord2Vec(Reference.Invoke("setVectorSize", value));
|
||||
|
||||
/// <summary>
|
||||
|
@ -100,9 +103,9 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// vocabulary, the default is 5.
|
||||
/// </param>
|
||||
/// <returns><see cref="Word2Vec"/></returns>
|
||||
public virtual Word2Vec SetMinCount(int value) =>
|
||||
public virtual Word2Vec SetMinCount(int value) =>
|
||||
WrapAsWord2Vec(Reference.Invoke("setMinCount", value));
|
||||
|
||||
|
||||
/// <summary>Gets the maximum number of iterations.</summary>
|
||||
/// <returns>The maximum number of iterations.</returns>
|
||||
public int GetMaxIter() => (int)Reference.Invoke("getMaxIter");
|
||||
|
@ -110,14 +113,14 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <summary>Maximum number of iterations (>= 0).</summary>
|
||||
/// <param name="value">The number of iterations.</param>
|
||||
/// <returns><see cref="Word2Vec"/></returns>
|
||||
public Word2Vec SetMaxIter(int value) =>
|
||||
public Word2Vec SetMaxIter(int value) =>
|
||||
WrapAsWord2Vec(Reference.Invoke("setMaxIter", value));
|
||||
|
||||
/// <summary>
|
||||
/// Gets the maximum length (in words) of each sentence in the input data.
|
||||
/// </summary>
|
||||
/// <returns>The maximum length (in words) of each sentence in the input data.</returns>
|
||||
public virtual int GetMaxSentenceLength() =>
|
||||
public virtual int GetMaxSentenceLength() =>
|
||||
(int)Reference.Invoke("getMaxSentenceLength");
|
||||
|
||||
/// <summary>
|
||||
|
@ -127,13 +130,13 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// The maximum length (in words) of each sentence in the input data.
|
||||
/// </param>
|
||||
/// <returns><see cref="Word2Vec"/></returns>
|
||||
public Word2Vec SetMaxSentenceLength(int value) =>
|
||||
public Word2Vec SetMaxSentenceLength(int value) =>
|
||||
WrapAsWord2Vec(Reference.Invoke("setMaxSentenceLength", value));
|
||||
|
||||
/// <summary>Gets the number of partitions for sentences of words.</summary>
|
||||
/// <returns>The number of partitions for sentences of words.</returns>
|
||||
public int GetNumPartitions() => (int)Reference.Invoke("getNumPartitions");
|
||||
|
||||
|
||||
/// <summary>Sets the number of partitions for sentences of words.</summary>
|
||||
/// <param name="value">
|
||||
/// The number of partitions for sentences of words, default is 1.
|
||||
|
@ -145,7 +148,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <summary>Gets the value that is used for the random seed.</summary>
|
||||
/// <returns>The value that is used for the random seed.</returns>
|
||||
public long GetSeed() => (long)Reference.Invoke("getSeed");
|
||||
|
||||
|
||||
/// <summary>Random seed.</summary>
|
||||
/// <param name="value">The value to use for the random seed.</param>
|
||||
/// <returns><see cref="Word2Vec"/></returns>
|
||||
|
@ -155,7 +158,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <summary>Gets the size to be used for each iteration of optimization.</summary>
|
||||
/// <returns>The size to be used for each iteration of optimization.</returns>
|
||||
public double GetStepSize() => (double)Reference.Invoke("getStepSize");
|
||||
|
||||
|
||||
/// <summary>Step size to be used for each iteration of optimization (> 0).</summary>
|
||||
/// <param name="value">Value to use for the step size.</param>
|
||||
/// <returns><see cref="Word2Vec"/></returns>
|
||||
|
@ -165,7 +168,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <summary>Gets the window size (context words from [-window, window]).</summary>
|
||||
/// <returns>The window size.</returns>
|
||||
public int GetWindowSize() => (int)Reference.Invoke("getWindowSize");
|
||||
|
||||
|
||||
/// <summary>The window size (context words from [-window, window]).</summary>
|
||||
/// <param name="value">
|
||||
/// The window size (context words from [-window, window]), default is 5.
|
||||
|
@ -173,11 +176,11 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns><see cref="Word2Vec"/></returns>
|
||||
public Word2Vec SetWindowSize(int value) =>
|
||||
WrapAsWord2Vec(Reference.Invoke("setWindowSize", value));
|
||||
|
||||
|
||||
/// <summary>Fits a model to the input data.</summary>
|
||||
/// <param name="dataFrame">The <see cref="DataFrame"/> to fit the model to.</param>
|
||||
/// <returns><see cref="Word2VecModel"/></returns>
|
||||
public Word2VecModel Fit(DataFrame dataFrame) =>
|
||||
public override Word2VecModel Fit(DataFrame dataFrame) =>
|
||||
new Word2VecModel((JvmObjectReference)Reference.Invoke("fit", dataFrame));
|
||||
|
||||
/// <summary>
|
||||
|
@ -187,8 +190,29 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <returns>New <see cref="Word2Vec"/> object, loaded from path.</returns>
|
||||
public static Word2Vec Load(string path) => WrapAsWord2Vec(
|
||||
SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_word2VecClassName, "load", path));
|
||||
|
||||
private static Word2Vec WrapAsWord2Vec(object obj) =>
|
||||
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{Word2Vec}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<Word2Vec> Read() =>
|
||||
new JavaMLReader<Word2Vec>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static Word2Vec WrapAsWord2Vec(object obj) =>
|
||||
new Word2Vec((JvmObjectReference)obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,9 +8,12 @@ using Microsoft.Spark.Sql;
|
|||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
public class Word2VecModel : FeatureBase<Word2VecModel>
|
||||
public class Word2VecModel :
|
||||
JavaModel<Word2VecModel>,
|
||||
IJavaMLWritable,
|
||||
IJavaMLReadable<Word2VecModel>
|
||||
{
|
||||
private static readonly string s_word2VecModelClassName =
|
||||
private static readonly string s_word2VecModelClassName =
|
||||
"org.apache.spark.ml.feature.Word2VecModel";
|
||||
|
||||
/// <summary>
|
||||
|
@ -28,18 +31,18 @@ namespace Microsoft.Spark.ML.Feature
|
|||
public Word2VecModel(string uid) : base(s_word2VecModelClassName, uid)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
internal Word2VecModel(JvmObjectReference jvmObject) : base(jvmObject)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Transform a sentence column to a vector column to represent the whole sentence.
|
||||
/// </summary>
|
||||
/// <param name="documentDF"><see cref="DataFrame"/> to transform</param>
|
||||
public DataFrame Transform(DataFrame documentDF) =>
|
||||
public override DataFrame Transform(DataFrame documentDF) =>
|
||||
new DataFrame((JvmObjectReference)Reference.Invoke("transform", documentDF));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Find <paramref name="num"/> number of words whose vector representation is most similar to
|
||||
/// the supplied vector. If the supplied vector is the vector representation of a word in
|
||||
|
@ -51,7 +54,7 @@ namespace Microsoft.Spark.ML.Feature
|
|||
/// <param name="num">The number of words to find that are similar to "word"</param>
|
||||
public DataFrame FindSynonyms(string word, int num) =>
|
||||
new DataFrame((JvmObjectReference)Reference.Invoke("findSynonyms", word, num));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Loads the <see cref="Word2VecModel"/> that was previously saved using Save(string).
|
||||
/// </summary>
|
||||
|
@ -62,8 +65,29 @@ namespace Microsoft.Spark.ML.Feature
|
|||
public static Word2VecModel Load(string path) => WrapAsWord2VecModel(
|
||||
SparkEnvironment.JvmBridge.CallStaticJavaMethod(
|
||||
s_word2VecModelClassName, "load", path));
|
||||
|
||||
private static Word2VecModel WrapAsWord2VecModel(object obj) =>
|
||||
|
||||
/// <summary>
|
||||
/// Saves the object so that it can be loaded later using Load. Note that these objects
|
||||
/// can be shared with Scala by Loading or Saving in Scala.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
public JavaMLWriter Write() =>
|
||||
new JavaMLWriter((JvmObjectReference)Reference.Invoke("write"));
|
||||
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{Word2VecModel}"/> instance for this ML instance.</returns>
|
||||
public JavaMLReader<Word2VecModel> Read() =>
|
||||
new JavaMLReader<Word2VecModel>((JvmObjectReference)Reference.Invoke("read"));
|
||||
|
||||
private static Word2VecModel WrapAsWord2VecModel(object obj) =>
|
||||
new Word2VecModel((JvmObjectReference)obj);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using Microsoft.Spark.Interop;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
|
||||
|
@ -52,10 +51,7 @@ namespace Microsoft.Spark.ML.Feature.Param
|
|||
{
|
||||
}
|
||||
|
||||
internal Param(JvmObjectReference jvmObject)
|
||||
{
|
||||
Reference = jvmObject;
|
||||
}
|
||||
internal Param(JvmObjectReference jvmObject) => Reference = jvmObject;
|
||||
|
||||
public JvmObjectReference Reference { get; private set; }
|
||||
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Interop;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
|
||||
namespace Microsoft.Spark.ML.Feature.Param
|
||||
{
|
||||
/// <summary>
|
||||
/// A param to value map.
|
||||
/// </summary>
|
||||
public class ParamMap : IJvmObjectReferenceProvider
|
||||
{
|
||||
private static readonly string s_ParamMapClassName = "org.apache.spark.ml.param.ParamMap";
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new instance of a <see cref="ParamMap"/>
|
||||
/// </summary>
|
||||
public ParamMap() : this(SparkEnvironment.JvmBridge.CallConstructor(s_ParamMapClassName))
|
||||
{
|
||||
}
|
||||
|
||||
internal ParamMap(JvmObjectReference jvmObject) => Reference = jvmObject;
|
||||
|
||||
public JvmObjectReference Reference { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Puts a (param, value) pair (overwrites if the input param exists).
|
||||
/// </summary>
|
||||
/// <param name="param">The param to be added</param>
|
||||
/// <param name="value">The param value to be added</param>
|
||||
public ParamMap Put<T>(Param param, T value) =>
|
||||
WrapAsParamMap((JvmObjectReference)Reference.Invoke("put", param, value));
|
||||
|
||||
/// <summary>
|
||||
/// Returns the string representation of this ParamMap.
|
||||
/// </summary>
|
||||
/// <returns>The string representation of this ParamMap.</returns>
|
||||
public override string ToString() =>
|
||||
(string)Reference.Invoke("toString");
|
||||
|
||||
private static ParamMap WrapAsParamMap(object obj) =>
|
||||
new ParamMap((JvmObjectReference)obj);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Interop;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
|
||||
namespace Microsoft.Spark.ML.Feature.Param
|
||||
{
|
||||
/// <summary>
|
||||
/// A param and its value.
|
||||
/// </summary>
|
||||
public sealed class ParamPair<T> : IJvmObjectReferenceProvider
|
||||
{
|
||||
private static readonly string s_ParamPairClassName = "org.apache.spark.ml.param.ParamPair";
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new instance of a <see cref="ParamPair{T}"/>
|
||||
/// </summary>
|
||||
public ParamPair(Param param, T value)
|
||||
: this(SparkEnvironment.JvmBridge.CallConstructor(s_ParamPairClassName, param, value))
|
||||
{
|
||||
}
|
||||
|
||||
internal ParamPair(JvmObjectReference jvmObject) => Reference = jvmObject;
|
||||
|
||||
public JvmObjectReference Reference { get; private set; }
|
||||
}
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.Linq;
|
||||
using System.Reflection;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Microsoft.Spark.Sql;
|
||||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
/// <summary>
|
||||
/// Class for utility classes that can load ML instances.
|
||||
/// </summary>
|
||||
/// <typeparam name="T">ML instance type</typeparam>
|
||||
public class JavaMLReader<T> : IJvmObjectReferenceProvider
|
||||
{
|
||||
internal JavaMLReader(JvmObjectReference jvmObject) => Reference = jvmObject;
|
||||
|
||||
public JvmObjectReference Reference { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Loads the ML component from the input path.
|
||||
/// </summary>
|
||||
/// <param name="path">The path the previous instance of type T was saved to</param>
|
||||
/// <returns>The type T instance</returns>
|
||||
public T Load(string path) =>
|
||||
WrapAsType((JvmObjectReference)Reference.Invoke("load", path));
|
||||
|
||||
/// <summary>Sets the Spark Session to use for saving/loading.</summary>
|
||||
/// <param name="sparkSession">The Spark Session to be set</param>
|
||||
public JavaMLReader<T> Session(SparkSession sparkSession)
|
||||
{
|
||||
Reference.Invoke("session", sparkSession);
|
||||
return this;
|
||||
}
|
||||
|
||||
private static T WrapAsType(JvmObjectReference reference)
|
||||
{
|
||||
ConstructorInfo constructor = typeof(T)
|
||||
.GetConstructors(BindingFlags.NonPublic | BindingFlags.Instance)
|
||||
.Single(c =>
|
||||
{
|
||||
ParameterInfo[] parameters = c.GetParameters();
|
||||
return (parameters.Length == 1) &&
|
||||
(parameters[0].ParameterType == typeof(JvmObjectReference));
|
||||
});
|
||||
|
||||
return (T)constructor.Invoke(new object[] { reference });
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for objects that provide MLReader.
|
||||
/// </summary>
|
||||
/// <typeparam name="T">
|
||||
/// ML instance type
|
||||
/// </typeparam>
|
||||
public interface IJavaMLReadable<T>
|
||||
{
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLReader instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLReader{T}"/> instance for this ML instance.</returns>
|
||||
JavaMLReader<T> Read();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,73 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Microsoft.Spark.Sql;
|
||||
|
||||
namespace Microsoft.Spark.ML.Feature
|
||||
{
|
||||
/// <summary>
|
||||
/// Class for utility classes that can save ML instances in Spark's internal format.
|
||||
/// </summary>
|
||||
public class JavaMLWriter : IJvmObjectReferenceProvider
|
||||
{
|
||||
internal JavaMLWriter(JvmObjectReference jvmObject) => Reference = jvmObject;
|
||||
|
||||
public JvmObjectReference Reference { get; private set; }
|
||||
|
||||
/// <summary>Saves the ML instances to the input path.</summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
public void Save(string path) => Reference.Invoke("save", path);
|
||||
|
||||
/// <summary>
|
||||
/// save() handles overwriting and then calls this method.
|
||||
/// Subclasses should override this method to implement the actual saving of the instance.
|
||||
/// </summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
protected void SaveImpl(string path) => Reference.Invoke("saveImpl", path);
|
||||
|
||||
/// <summary>Overwrites if the output path already exists.</summary>
|
||||
public JavaMLWriter Overwrite()
|
||||
{
|
||||
Reference.Invoke("overwrite");
|
||||
return this;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Adds an option to the underlying MLWriter. See the documentation for the specific model's
|
||||
/// writer for possible options. The option name (key) is case-insensitive.
|
||||
/// </summary>
|
||||
/// <param name="key">key of the option</param>
|
||||
/// <param name="value">value of the option</param>
|
||||
public JavaMLWriter Option(string key, string value)
|
||||
{
|
||||
Reference.Invoke("option", key, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
/// <summary>Sets the Spark Session to use for saving/loading.</summary>
|
||||
/// <param name="sparkSession">The Spark Session to be set</param>
|
||||
public JavaMLWriter Session(SparkSession sparkSession)
|
||||
{
|
||||
Reference.Invoke("session", sparkSession);
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for classes that provide JavaMLWriter.
|
||||
/// </summary>
|
||||
public interface IJavaMLWritable
|
||||
{
|
||||
/// <summary>
|
||||
/// Get the corresponding JavaMLWriter instance.
|
||||
/// </summary>
|
||||
/// <returns>a <see cref="JavaMLWriter"/> instance for this ML instance.</returns>
|
||||
JavaMLWriter Write();
|
||||
|
||||
/// <summary>Saves this ML instance to the input path</summary>
|
||||
/// <param name="path">The path to save the object to</param>
|
||||
void Save(string path);
|
||||
}
|
||||
}
|
|
@ -33,6 +33,12 @@
|
|||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-mllib_${scala.binary.version}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
/*
|
||||
* Licensed to the .NET Foundation under one or more agreements.
|
||||
* The .NET Foundation licenses this file to you under the MIT license.
|
||||
* See the LICENSE file in the project root for more information.
|
||||
*/
|
||||
|
||||
package org.apache.spark.api.dotnet
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** DotnetUtils object that hosts some helper functions
|
||||
* that help with data type conversions between dotnet and scala
|
||||
*/
|
||||
object DotnetUtils {
|
||||
|
||||
/** A helper function to convert scala Map to java.util.Map
|
||||
* @param value - scala Map
|
||||
* @return java.util.Map
|
||||
*/
|
||||
def convertToJavaMap(value: Map[_, _]): java.util.Map[_, _] = value.asJava
|
||||
|
||||
/** Convert java data type to corresponding scala type
|
||||
* @param value - java.lang.Object
|
||||
* @return Any
|
||||
*/
|
||||
def mapScalaToJava(value: java.lang.Object): Any = {
|
||||
value match {
|
||||
case i: java.lang.Integer => i.toInt
|
||||
case d: java.lang.Double => d.toDouble
|
||||
case f: java.lang.Float => f.toFloat
|
||||
case b: java.lang.Boolean => b.booleanValue()
|
||||
case l: java.lang.Long => l.toLong
|
||||
case s: java.lang.Short => s.toShort
|
||||
case by: java.lang.Byte => by.toByte
|
||||
case c: java.lang.Character => c.toChar
|
||||
case _ => value
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
|
||||
/*
|
||||
* Licensed to the .NET Foundation under one or more agreements.
|
||||
* The .NET Foundation licenses this file to you under the MIT license.
|
||||
* See the LICENSE file in the project root for more information.
|
||||
*/
|
||||
|
||||
package org.apache.spark.mllib.api.dotnet
|
||||
|
||||
import org.apache.spark.ml._
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** MLUtils object that hosts helper functions
|
||||
* related to ML usage
|
||||
*/
|
||||
object MLUtils {
|
||||
|
||||
/** A helper function to let pipeline accept java.util.ArrayList
|
||||
* format stages in scala code
|
||||
* @param pipeline - The pipeline to be set stages
|
||||
* @param value - A java.util.ArrayList of PipelineStages to be set as stages
|
||||
* @return The pipeline
|
||||
*/
|
||||
def setPipelineStages(pipeline: Pipeline, value: java.util.ArrayList[_ <: PipelineStage]): Pipeline =
|
||||
pipeline.setStages(value.asScala.toArray)
|
||||
}
|
|
@ -33,6 +33,12 @@
|
|||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-mllib_${scala.binary.version}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
/*
|
||||
* Licensed to the .NET Foundation under one or more agreements.
|
||||
* The .NET Foundation licenses this file to you under the MIT license.
|
||||
* See the LICENSE file in the project root for more information.
|
||||
*/
|
||||
|
||||
package org.apache.spark.api.dotnet
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** DotnetUtils object that hosts some helper functions
|
||||
* that help with data type conversions between dotnet and scala
|
||||
*/
|
||||
object DotnetUtils {
|
||||
|
||||
/** A helper function to convert scala Map to java.util.Map
|
||||
* @param value - scala Map
|
||||
* @return java.util.Map
|
||||
*/
|
||||
def convertToJavaMap(value: Map[_, _]): java.util.Map[_, _] = value.asJava
|
||||
|
||||
/** Convert java data type to corresponding scala type
|
||||
* @param value - java.lang.Object
|
||||
* @return Any
|
||||
*/
|
||||
def mapScalaToJava(value: java.lang.Object): Any = {
|
||||
value match {
|
||||
case i: java.lang.Integer => i.toInt
|
||||
case d: java.lang.Double => d.toDouble
|
||||
case f: java.lang.Float => f.toFloat
|
||||
case b: java.lang.Boolean => b.booleanValue()
|
||||
case l: java.lang.Long => l.toLong
|
||||
case s: java.lang.Short => s.toShort
|
||||
case by: java.lang.Byte => by.toByte
|
||||
case c: java.lang.Character => c.toChar
|
||||
case _ => value
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
|
||||
/*
|
||||
* Licensed to the .NET Foundation under one or more agreements.
|
||||
* The .NET Foundation licenses this file to you under the MIT license.
|
||||
* See the LICENSE file in the project root for more information.
|
||||
*/
|
||||
|
||||
package org.apache.spark.mllib.api.dotnet
|
||||
|
||||
import org.apache.spark.ml._
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** MLUtils object that hosts helper functions
|
||||
* related to ML usage
|
||||
*/
|
||||
object MLUtils {
|
||||
|
||||
/** A helper function to let pipeline accept java.util.ArrayList
|
||||
* format stages in scala code
|
||||
* @param pipeline - The pipeline to be set stages
|
||||
* @param value - A java.util.ArrayList of PipelineStages to be set as stages
|
||||
* @return The pipeline
|
||||
*/
|
||||
def setPipelineStages(pipeline: Pipeline, value: java.util.ArrayList[_ <: PipelineStage]): Pipeline =
|
||||
pipeline.setStages(value.asScala.toArray)
|
||||
}
|
|
@ -33,6 +33,12 @@
|
|||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-mllib_${scala.binary.version}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
/*
|
||||
* Licensed to the .NET Foundation under one or more agreements.
|
||||
* The .NET Foundation licenses this file to you under the MIT license.
|
||||
* See the LICENSE file in the project root for more information.
|
||||
*/
|
||||
|
||||
package org.apache.spark.api.dotnet
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** DotnetUtils object that hosts some helper functions
|
||||
* that help with data type conversions between dotnet and scala
|
||||
*/
|
||||
object DotnetUtils {
|
||||
|
||||
/** A helper function to convert scala Map to java.util.Map
|
||||
* @param value - scala Map
|
||||
* @return java.util.Map
|
||||
*/
|
||||
def convertToJavaMap(value: Map[_, _]): java.util.Map[_, _] = value.asJava
|
||||
|
||||
/** Convert java data type to corresponding scala type
|
||||
* @param value - java.lang.Object
|
||||
* @return Any
|
||||
*/
|
||||
def mapScalaToJava(value: java.lang.Object): Any = {
|
||||
value match {
|
||||
case i: java.lang.Integer => i.toInt
|
||||
case d: java.lang.Double => d.toDouble
|
||||
case f: java.lang.Float => f.toFloat
|
||||
case b: java.lang.Boolean => b.booleanValue()
|
||||
case l: java.lang.Long => l.toLong
|
||||
case s: java.lang.Short => s.toShort
|
||||
case by: java.lang.Byte => by.toByte
|
||||
case c: java.lang.Character => c.toChar
|
||||
case _ => value
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
|
||||
/*
|
||||
* Licensed to the .NET Foundation under one or more agreements.
|
||||
* The .NET Foundation licenses this file to you under the MIT license.
|
||||
* See the LICENSE file in the project root for more information.
|
||||
*/
|
||||
|
||||
package org.apache.spark.mllib.api.dotnet
|
||||
|
||||
import org.apache.spark.ml._
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** MLUtils object that hosts helper functions
|
||||
* related to ML usage
|
||||
*/
|
||||
object MLUtils {
|
||||
|
||||
/** A helper function to let pipeline accept java.util.ArrayList
|
||||
* format stages in scala code
|
||||
* @param pipeline - The pipeline to be set stages
|
||||
* @param value - A java.util.ArrayList of PipelineStages to be set as stages
|
||||
* @return The pipeline
|
||||
*/
|
||||
def setPipelineStages(pipeline: Pipeline, value: java.util.ArrayList[_ <: PipelineStage]): Pipeline =
|
||||
pipeline.setStages(value.asScala.toArray)
|
||||
}
|
|
@ -33,6 +33,12 @@
|
|||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-mllib_${scala.binary.version}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
/*
|
||||
* Licensed to the .NET Foundation under one or more agreements.
|
||||
* The .NET Foundation licenses this file to you under the MIT license.
|
||||
* See the LICENSE file in the project root for more information.
|
||||
*/
|
||||
|
||||
package org.apache.spark.api.dotnet
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** DotnetUtils object that hosts some helper functions
|
||||
* that help with data type conversions between dotnet and scala
|
||||
*/
|
||||
object DotnetUtils {
|
||||
|
||||
/** A helper function to convert scala Map to java.util.Map
|
||||
* @param value - scala Map
|
||||
* @return java.util.Map
|
||||
*/
|
||||
def convertToJavaMap(value: Map[_, _]): java.util.Map[_, _] = value.asJava
|
||||
|
||||
/** Convert java data type to corresponding scala type
|
||||
* @param value - java.lang.Object
|
||||
* @return Any
|
||||
*/
|
||||
def mapScalaToJava(value: java.lang.Object): Any = {
|
||||
value match {
|
||||
case i: java.lang.Integer => i.toInt
|
||||
case d: java.lang.Double => d.toDouble
|
||||
case f: java.lang.Float => f.toFloat
|
||||
case b: java.lang.Boolean => b.booleanValue()
|
||||
case l: java.lang.Long => l.toLong
|
||||
case s: java.lang.Short => s.toShort
|
||||
case by: java.lang.Byte => by.toByte
|
||||
case c: java.lang.Character => c.toChar
|
||||
case _ => value
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
|
||||
/*
|
||||
* Licensed to the .NET Foundation under one or more agreements.
|
||||
* The .NET Foundation licenses this file to you under the MIT license.
|
||||
* See the LICENSE file in the project root for more information.
|
||||
*/
|
||||
|
||||
package org.apache.spark.mllib.api.dotnet
|
||||
|
||||
import org.apache.spark.ml._
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** MLUtils object that hosts helper functions
|
||||
* related to ML usage
|
||||
*/
|
||||
object MLUtils {
|
||||
|
||||
/** A helper function to let pipeline accept java.util.ArrayList
|
||||
* format stages in scala code
|
||||
* @param pipeline - The pipeline to be set stages
|
||||
* @param value - A java.util.ArrayList of PipelineStages to be set as stages
|
||||
* @return The pipeline
|
||||
*/
|
||||
def setPipelineStages(pipeline: Pipeline, value: java.util.ArrayList[_ <: PipelineStage]): Pipeline =
|
||||
pipeline.setStages(value.asScala.toArray)
|
||||
}
|
Загрузка…
Ссылка в новой задаче