Added CausalityExample, Crowdsourcing, and CrowdsourcingWithWords examples from the user guide. (#67)

Added CausalityExample, Crowdsourcing, and CrowdsourcingWithWords examples from the user guide.
Modernized the BayesianPCA example.
Example projects that support .NET Core now build under DebugCore and ReleaseCore.
Removed spurious app.config files.
This commit is contained in:
Tom Minka 2018-11-03 16:13:33 +00:00 коммит произвёл GitHub
Родитель e8cd27633b
Коммит fcb8a8ec90
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
46 изменённых файлов: 5195 добавлений и 186 удалений

Просмотреть файл

@ -127,7 +127,7 @@ In Runtime project settings > Build:
You can also use other BLAS/LAPACK libraries compatible with MKL. If your library is not called "mkl_rt.dll", change the `dllName` string in [Lapack.cs](https://github.com/dotnet/infer/blob/master/src/Runtime/Core/Maths/Lapack.cs).
When using use this special build of Infer.NET, you must tell your code where to find the MKL dynamic libraries.
When using this special build of Infer.NET, you must tell your code where to find the MKL dynamic libraries.
1. Download [Intel MKL](https://software.intel.com/en-us/mkl/) which includes redistributables. Typically, this is installed in `C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\redist\intel64_win`. We'll reference this folder as *MKL_DIR*.
1. (Optional) Add *MKL_DIR* to the environment variable PATH. If you do this, the remaining steps are unnecessary, but your code will only run on machines where this has been done.
1. Add the MKL dynamic libraries as items to your project that uses Infer.NET.

Просмотреть файл

@ -80,6 +80,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tools.BuildFactorDoc", "src
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tools.PrepareSource", "src\Tools\PrepareSource\Tools.PrepareSource.csproj", "{8342B783-EEBE-4DE9-9AA0-77C074E3869E}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Crowdsourcing", "src\Examples\Crowdsourcing\Crowdsourcing.csproj", "{816CD64D-7189-46E3-8C54-D4ED4C0BB758}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CrowdsourcingWithWords", "src\Examples\CrowdsourcingWithWords\CrowdsourcingWithWords.csproj", "{6563C4C6-411E-4D67-B458-830AD9B311D2}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@ -209,11 +213,13 @@ Global
{52D174E7-2407-4FC1-9DDA-4D9D14F18618}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{52D174E7-2407-4FC1-9DDA-4D9D14F18618}.Debug|Any CPU.Build.0 = Debug|Any CPU
{52D174E7-2407-4FC1-9DDA-4D9D14F18618}.DebugCore|Any CPU.ActiveCfg = DebugCore|Any CPU
{52D174E7-2407-4FC1-9DDA-4D9D14F18618}.DebugCore|Any CPU.Build.0 = DebugCore|Any CPU
{52D174E7-2407-4FC1-9DDA-4D9D14F18618}.DebugFull|Any CPU.ActiveCfg = DebugFull|Any CPU
{52D174E7-2407-4FC1-9DDA-4D9D14F18618}.DebugFull|Any CPU.Build.0 = DebugFull|Any CPU
{52D174E7-2407-4FC1-9DDA-4D9D14F18618}.Release|Any CPU.ActiveCfg = Release|Any CPU
{52D174E7-2407-4FC1-9DDA-4D9D14F18618}.Release|Any CPU.Build.0 = Release|Any CPU
{52D174E7-2407-4FC1-9DDA-4D9D14F18618}.ReleaseCore|Any CPU.ActiveCfg = ReleaseCore|Any CPU
{52D174E7-2407-4FC1-9DDA-4D9D14F18618}.ReleaseCore|Any CPU.Build.0 = ReleaseCore|Any CPU
{52D174E7-2407-4FC1-9DDA-4D9D14F18618}.ReleaseFull|Any CPU.ActiveCfg = ReleaseFull|Any CPU
{52D174E7-2407-4FC1-9DDA-4D9D14F18618}.ReleaseFull|Any CPU.Build.0 = ReleaseFull|Any CPU
{6139CF19-0190-4ED5-AEE3-D3CE7458E517}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
@ -229,11 +235,13 @@ Global
{D2A7B5F5-8D33-45AC-9776-07C23F5859BB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D2A7B5F5-8D33-45AC-9776-07C23F5859BB}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D2A7B5F5-8D33-45AC-9776-07C23F5859BB}.DebugCore|Any CPU.ActiveCfg = DebugCore|Any CPU
{D2A7B5F5-8D33-45AC-9776-07C23F5859BB}.DebugCore|Any CPU.Build.0 = DebugCore|Any CPU
{D2A7B5F5-8D33-45AC-9776-07C23F5859BB}.DebugFull|Any CPU.ActiveCfg = DebugFull|Any CPU
{D2A7B5F5-8D33-45AC-9776-07C23F5859BB}.DebugFull|Any CPU.Build.0 = DebugFull|Any CPU
{D2A7B5F5-8D33-45AC-9776-07C23F5859BB}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D2A7B5F5-8D33-45AC-9776-07C23F5859BB}.Release|Any CPU.Build.0 = Release|Any CPU
{D2A7B5F5-8D33-45AC-9776-07C23F5859BB}.ReleaseCore|Any CPU.ActiveCfg = ReleaseCore|Any CPU
{D2A7B5F5-8D33-45AC-9776-07C23F5859BB}.ReleaseCore|Any CPU.Build.0 = ReleaseCore|Any CPU
{D2A7B5F5-8D33-45AC-9776-07C23F5859BB}.ReleaseFull|Any CPU.ActiveCfg = ReleaseFull|Any CPU
{D2A7B5F5-8D33-45AC-9776-07C23F5859BB}.ReleaseFull|Any CPU.Build.0 = ReleaseFull|Any CPU
{87D09BD4-119E-49C1-B0B4-86DF962A00EE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
@ -249,11 +257,13 @@ Global
{6FF3E672-378C-4D61-B4CA-A5A5E01C2563}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{6FF3E672-378C-4D61-B4CA-A5A5E01C2563}.Debug|Any CPU.Build.0 = Debug|Any CPU
{6FF3E672-378C-4D61-B4CA-A5A5E01C2563}.DebugCore|Any CPU.ActiveCfg = DebugCore|Any CPU
{6FF3E672-378C-4D61-B4CA-A5A5E01C2563}.DebugCore|Any CPU.Build.0 = DebugCore|Any CPU
{6FF3E672-378C-4D61-B4CA-A5A5E01C2563}.DebugFull|Any CPU.ActiveCfg = DebugFull|Any CPU
{6FF3E672-378C-4D61-B4CA-A5A5E01C2563}.DebugFull|Any CPU.Build.0 = DebugFull|Any CPU
{6FF3E672-378C-4D61-B4CA-A5A5E01C2563}.Release|Any CPU.ActiveCfg = Release|Any CPU
{6FF3E672-378C-4D61-B4CA-A5A5E01C2563}.Release|Any CPU.Build.0 = Release|Any CPU
{6FF3E672-378C-4D61-B4CA-A5A5E01C2563}.ReleaseCore|Any CPU.ActiveCfg = ReleaseCore|Any CPU
{6FF3E672-378C-4D61-B4CA-A5A5E01C2563}.ReleaseCore|Any CPU.Build.0 = ReleaseCore|Any CPU
{6FF3E672-378C-4D61-B4CA-A5A5E01C2563}.ReleaseFull|Any CPU.ActiveCfg = ReleaseFull|Any CPU
{6FF3E672-378C-4D61-B4CA-A5A5E01C2563}.ReleaseFull|Any CPU.Build.0 = ReleaseFull|Any CPU
{5B669C82-B04C-4DD6-8CE6-47D025D98777}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
@ -422,6 +432,30 @@ Global
{8342B783-EEBE-4DE9-9AA0-77C074E3869E}.ReleaseCore|Any CPU.Build.0 = Release|Any CPU
{8342B783-EEBE-4DE9-9AA0-77C074E3869E}.ReleaseFull|Any CPU.ActiveCfg = Release|Any CPU
{8342B783-EEBE-4DE9-9AA0-77C074E3869E}.ReleaseFull|Any CPU.Build.0 = Release|Any CPU
{816CD64D-7189-46E3-8C54-D4ED4C0BB758}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{816CD64D-7189-46E3-8C54-D4ED4C0BB758}.Debug|Any CPU.Build.0 = Debug|Any CPU
{816CD64D-7189-46E3-8C54-D4ED4C0BB758}.DebugCore|Any CPU.ActiveCfg = DebugCore|Any CPU
{816CD64D-7189-46E3-8C54-D4ED4C0BB758}.DebugCore|Any CPU.Build.0 = DebugCore|Any CPU
{816CD64D-7189-46E3-8C54-D4ED4C0BB758}.DebugFull|Any CPU.ActiveCfg = DebugFull|Any CPU
{816CD64D-7189-46E3-8C54-D4ED4C0BB758}.DebugFull|Any CPU.Build.0 = DebugFull|Any CPU
{816CD64D-7189-46E3-8C54-D4ED4C0BB758}.Release|Any CPU.ActiveCfg = Release|Any CPU
{816CD64D-7189-46E3-8C54-D4ED4C0BB758}.Release|Any CPU.Build.0 = Release|Any CPU
{816CD64D-7189-46E3-8C54-D4ED4C0BB758}.ReleaseCore|Any CPU.ActiveCfg = ReleaseCore|Any CPU
{816CD64D-7189-46E3-8C54-D4ED4C0BB758}.ReleaseCore|Any CPU.Build.0 = ReleaseCore|Any CPU
{816CD64D-7189-46E3-8C54-D4ED4C0BB758}.ReleaseFull|Any CPU.ActiveCfg = ReleaseFull|Any CPU
{816CD64D-7189-46E3-8C54-D4ED4C0BB758}.ReleaseFull|Any CPU.Build.0 = ReleaseFull|Any CPU
{6563C4C6-411E-4D67-B458-830AD9B311D2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{6563C4C6-411E-4D67-B458-830AD9B311D2}.Debug|Any CPU.Build.0 = Debug|Any CPU
{6563C4C6-411E-4D67-B458-830AD9B311D2}.DebugCore|Any CPU.ActiveCfg = DebugCore|Any CPU
{6563C4C6-411E-4D67-B458-830AD9B311D2}.DebugCore|Any CPU.Build.0 = DebugCore|Any CPU
{6563C4C6-411E-4D67-B458-830AD9B311D2}.DebugFull|Any CPU.ActiveCfg = DebugFull|Any CPU
{6563C4C6-411E-4D67-B458-830AD9B311D2}.DebugFull|Any CPU.Build.0 = DebugFull|Any CPU
{6563C4C6-411E-4D67-B458-830AD9B311D2}.Release|Any CPU.ActiveCfg = Release|Any CPU
{6563C4C6-411E-4D67-B458-830AD9B311D2}.Release|Any CPU.Build.0 = Release|Any CPU
{6563C4C6-411E-4D67-B458-830AD9B311D2}.ReleaseCore|Any CPU.ActiveCfg = ReleaseCore|Any CPU
{6563C4C6-411E-4D67-B458-830AD9B311D2}.ReleaseCore|Any CPU.Build.0 = ReleaseCore|Any CPU
{6563C4C6-411E-4D67-B458-830AD9B311D2}.ReleaseFull|Any CPU.ActiveCfg = ReleaseFull|Any CPU
{6563C4C6-411E-4D67-B458-830AD9B311D2}.ReleaseFull|Any CPU.Build.0 = ReleaseFull|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@ -445,6 +479,8 @@ Global
{8D4D5502-4321-46A5-975B-C20BD745FC06} = {2964BB90-4E6D-49ED-AA35-645D94337C76}
{E2409457-2BA1-47C8-B53B-CE712896FE6E} = {2964BB90-4E6D-49ED-AA35-645D94337C76}
{7A774F1F-31D6-4D7F-90D5-9C4F387D2EEE} = {2964BB90-4E6D-49ED-AA35-645D94337C76}
{816CD64D-7189-46E3-8C54-D4ED4C0BB758} = {DC5F5BC4-CDB0-41F7-8B03-CD4C38C8DEB2}
{6563C4C6-411E-4D67-B458-830AD9B311D2} = {DC5F5BC4-CDB0-41F7-8B03-CD4C38C8DEB2}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {160F773C-9CF5-4F8D-B45A-1112A1BC5E16}

Просмотреть файл

@ -29,7 +29,7 @@ and many others.
## Installing pre-built binaries
Binaries for Infer.NET are located on [nuget.org](https://www.nuget.org/packages?q=Microsoft.ML.Probabilistic). These binaries are cross-platform and work anywhere that .NET is supported, so there is no need to select your platform. You do not need to clone the GitHub repository to use the pre-built binaries.
Binaries for Infer.NET are located on [nuget.org](https://www.nuget.org/packages?q=Microsoft.ML.Probabilistic). These binaries are cross-platform and work anywhere that .NET is supported, so there is no need to select your platform. The core packages target .NET Standard 2.0, making them useable from any project that targets .NET framework version 4.6.1 or .NET Core 2.1, as explained at [.NET implementation support](https://docs.microsoft.com/en-us/dotnet/standard/net-standard). You do not need to clone the GitHub repository to use the pre-built binaries.
There currently are [four maintained Infer.NET nuget packages](https://www.nuget.org/packages?q=Microsoft.ML.Probabilistic):

Просмотреть файл

@ -116,7 +116,6 @@
<DependentUpon>Resources.resx</DependentUpon>
<DesignTime>True</DesignTime>
</Compile>
<None Include="app.config" />
<None Include="Properties\Settings.settings">
<Generator>SettingsSingleFileGenerator</Generator>
<LastGenOutput>Settings.Designer.cs</LastGenOutput>

Просмотреть файл

@ -1,3 +0,0 @@
<?xml version="1.0"?>
<configuration>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/></startup></configuration>

Просмотреть файл

@ -150,7 +150,6 @@
<LastGenOutput>Resources.Designer.cs</LastGenOutput>
<SubType>Designer</SubType>
</EmbeddedResource>
<None Include="app.config" />
<None Include="Properties\Settings.settings">
<Generator>SettingsSingleFileGenerator</Generator>
<LastGenOutput>Settings.Designer.cs</LastGenOutput>

Просмотреть файл

@ -1,3 +0,0 @@
<?xml version="1.0"?>
<configuration>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/></startup></configuration>

Просмотреть файл

@ -0,0 +1,426 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.ML.Probabilistic;
using Microsoft.ML.Probabilistic.Algorithms;
using Microsoft.ML.Probabilistic.Factors;
using Microsoft.ML.Probabilistic.Distributions;
using Microsoft.ML.Probabilistic.Math;
using Microsoft.ML.Probabilistic.Models;
using Microsoft.ML.Probabilistic.Utilities;
namespace Crowdsourcing
{
/// <summary>
/// Provides the functions to reproduce an iterative labelling selection process with `enums` for the various task selection methods (`TaskSelectionMethod`) and worker selection methods (`WorkerSelectionMethod`).
/// </summary>
public class ActiveLearning
{
    /// <summary>
    /// Full list of simulated data from every task and worker.
    /// It is used for initialising the BCC and the CBCC model.
    /// </summary>
    IList<Datum> PredictionData;

    /// <summary>
    /// Flag to indicate whether the model instance is CBCC (true) or BCC (false).
    /// </summary>
    bool IsCommunityModel;

    /// <summary>
    /// List of worker ids.
    /// </summary>
    string[] WorkerIds;

    /// <summary>
    /// List of task ids.
    /// </summary>
    string[] TaskIds;

    /// <summary>
    /// Model instance.
    /// </summary>
    BCC bcc;

    /// <summary>
    /// Result instance for active learning.
    /// </summary>
    Results ActiveLearningResults;

    /// <summary>
    /// Result instance for batch training.
    /// </summary>
    Results BatchResults;

    /// <summary>
    /// Constructs an active learning instance with a specified data set and model instance.
    /// </summary>
    /// <param name="data">The data.</param>
    /// <param name="model">The model instance.</param>
    /// <param name="results">The results instance.</param>
    /// <param name="numCommunities">The number of communities (only for CBCC).</param>
    public ActiveLearning(IList<Datum> data, BCC model, Results results, int numCommunities)
    {
        this.bcc = model;
        IsCommunityModel = model is CommunityModel;
        ActiveLearningResults = results;
        BatchResults = results;
        WorkerIds = ActiveLearningResults.Mapping.WorkerIdToIndex.Keys.ToArray();
        TaskIds = ActiveLearningResults.Mapping.TaskIdToIndex.Keys.ToArray();

        // Builds the full matrix of data from every task and worker
        PredictionData = new List<Datum>();
        foreach (var workerId in WorkerIds)
        {
            foreach (var task in TaskIds)
            {
                PredictionData.Add(new Datum
                {
                    TaskId = task,
                    WorkerId = workerId,
                    WorkerLabel = 0,
                    GoldLabel = null
                });
            }
        }
    }

    /// <summary>
    /// Updates the active learning results object.
    /// </summary>
    /// <param name="results">The new results</param>
    public void UpdateActiveLearningResults(Results results)
    {
        ActiveLearningResults = results;
    }

    /// <summary>
    /// Computes the entropy on the true label posterior distribution of the active learning results.
    /// </summary>
    /// <returns>A dictionary keyed by the TaskId and the value is the true label entropy.</returns>
    public Dictionary<string, ActiveLearningResult> EntropyTrueLabelPosterior()
    {
        return BatchResults.TrueLabel.ToDictionary(kvp => kvp.Key, kvp => new ActiveLearningResult
        {
            TaskId = kvp.Key,
            // Tasks without a posterior get maximum utility so they are labelled first.
            TaskValue = kvp.Value == null ? double.MaxValue : -kvp.Value.GetAverageLog(kvp.Value)
        });
    }

    /// <summary>
    /// Runs the standard active learning procedure on a model instance and an input data set.
    /// </summary>
    /// <param name="data">The data.</param>
    /// <param name="modelName">The model name.</param>
    /// <param name="runType">The model run type.</param>
    /// <param name="model">The model instance.</param>
    /// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
    /// <param name="workerSelectionMethod">The method for selecting workers (only Random is implemented).</param>
    /// <param name="resultsDir">The directory to save the log files.</param>
    /// <param name="communityCount">The number of communities (only for CBCC).</param>
    /// <param name="initialNumLabelsPerTask">The initial number of exploratory labels that are randomly selected for each task.</param>
    public static void RunActiveLearning(IList<Datum> data, string modelName, RunType runType, BCC model, TaskSelectionMethod taskSelectionMethod, WorkerSelectionMethod workerSelectionMethod, string resultsDir, int communityCount = -1, int initialNumLabelsPerTask = 1)
    {
        // Count elapsed time
        Stopwatch stopWatchTotal = new Stopwatch();
        stopWatchTotal.Start();

        // Dictionary keyed by task Id, with randomly ordered labellings
        var groupedRandomisedData =
            data.GroupBy(d => d.TaskId).
            Select(g =>
            {
                var arr = g.ToArray();
                int cnt = arr.Length;
                var perm = Rand.Perm(cnt);
                return new
                {
                    key = g.Key,
                    arr = g.Select((t, i) => arr[perm[i]]).ToArray()
                };
            }).ToDictionary(a => a.key, a => a.arr);

        // Dictionary keyed by task Id, with label counts
        Dictionary<string, int> totalCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Length);
        Dictionary<string, int> currentCounts = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => initialNumLabelsPerTask);

        // Keyed by task, value is a HashSet containing all the remaining workers with a label - workers are removed after adding a new datum
        Dictionary<string, HashSet<string>> remainingWorkersPerTask = groupedRandomisedData.ToDictionary(kvp => kvp.Key, kvp => new HashSet<string>(kvp.Value.Select(dat => dat.WorkerId)));
        int numTaskIds = totalCounts.Count;
        int totalInstances = data.Count - initialNumLabelsPerTask * numTaskIds;

        // Log structures
        List<double> accuracy = new List<double>();
        List<double> nlpd = new List<double>();
        List<double> avgRecall = new List<double>();
        List<ActiveLearningResult> taskValueList = new List<ActiveLearningResult>();
        int index = 0;

        Console.WriteLine("Active Learning: {0}", modelName);
        Console.WriteLine("\t\tAcc\tAvgRec");

        // Get initial data
        Results results = new Results();
        List<Datum> subData = GetSubdata(groupedRandomisedData, currentCounts, remainingWorkersPerTask);
        List<Datum> nextData = null;
        int numIncremData = 3;
        ActiveLearning activeLearning = null;

        for (int iter = 0; iter < 500; iter++)
        {
            bool calculateAccuracy = true;
            bool doSnapShot = true; // Set to e.g. (iter % 100 == 0) to lower the frequency of snapshots.

            if (subData != null || nextData != null)
            {
                // Train the selected model on the labels gathered so far.
                switch (runType)
                {
                    case RunType.VoteDistribution:
                        results.RunMajorityVote(subData, calculateAccuracy, true);
                        break;
                    case RunType.MajorityVote:
                        results.RunMajorityVote(subData, calculateAccuracy, false);
                        break;
                    case RunType.DawidSkene:
                        results.RunDawidSkene(subData, calculateAccuracy);
                        break;
                    default: // Run BCC models
                        results.RunBCC(modelName, subData, data, model, Results.RunMode.ClearResults, calculateAccuracy, communityCount, false);
                        break;
                }
            }

            if (activeLearning == null)
            {
                activeLearning = new ActiveLearning(data, model, results, communityCount);
            }
            else
            {
                activeLearning.UpdateActiveLearningResults(results);
            }

            // Select next task by scoring each task with the chosen utility.
            Dictionary<string, ActiveLearningResult> TaskValue = null;
            switch (taskSelectionMethod)
            {
                case TaskSelectionMethod.RandomTask:
                    TaskValue = data.GroupBy(d => d.TaskId).ToDictionary(a => a.Key, a => new ActiveLearningResult
                    {
                        TaskValue = Rand.Double()
                    });
                    break;
                case TaskSelectionMethod.EntropyTask:
                default: // Entropy task selection
                    TaskValue = activeLearning.EntropyTrueLabelPosterior();
                    break;
            }

            nextData = GetNextData(groupedRandomisedData, TaskValue, currentCounts, totalCounts, numIncremData);

            // Stop once every task has used up all of its labels.
            if (nextData == null || nextData.Count == 0)
                break;

            index += nextData.Count;
            subData.AddRange(nextData);

            // Logs
            if (calculateAccuracy)
            {
                accuracy.Add(results.Accuracy);
                nlpd.Add(results.NegativeLogProb);
                avgRecall.Add(results.AvgRecall);
                taskValueList.Add(TaskValue[nextData.First().TaskId]);

                if (doSnapShot)
                {
                    Console.WriteLine("{0} of {1}:\t{2:0.000}\t{3:0.0000}", index, totalInstances, accuracy.Last(), avgRecall.Last());
                    DoSnapshot(accuracy, nlpd, avgRecall, taskValueList, results, modelName, "interim", resultsDir);
                }
            }
        }

        stopWatchTotal.Stop();
        DoSnapshot(accuracy, nlpd, avgRecall, taskValueList, results, modelName, "final", resultsDir);
        Console.WriteLine("Elapsed time: {0}\n", stopWatchTotal.Elapsed);
    }

    /// <summary>
    /// Saves the results of the inference and the model's parameters on csv files.
    /// </summary>
    /// <param name="accuracy">The list of accuracies evaluated on the gold labels at each active learning round.</param>
    /// <param name="nlpd">The list of NLPD scores evaluated on the gold labels at each active learning round.</param>
    /// <param name="avgRecall">The list of average recalls evaluated on the gold labels at each active learning round.</param>
    /// <param name="taskValue">The list of utilities of the task selected at each active learning round.</param>
    /// <param name="results">The result instance.</param>
    /// <param name="modelName">The model name.</param>
    /// <param name="suffix">The suffix of the csv files.</param>
    /// <param name="resultsDir">The directory to store the csv files.</param>
    public static void DoSnapshot(List<double> accuracy, List<double> nlpd, List<double> avgRecall, List<ActiveLearningResult> taskValue, Results results, string modelName, string suffix, string resultsDir)
    {
        // Snapshot of accuracies, parameters and taskValues.
        using (StreamWriter writer = new StreamWriter(String.Format("{2}{0}_graph_{1}.csv", modelName, suffix, resultsDir)))
        {
            var accArr = accuracy.ToArray();
            var nlpdArr = nlpd.ToArray();
            var avgRec = avgRecall.ToArray();
            for (int i = 0; i < accArr.Length; i++)
            {
                // Three rows per round: accuracy alone; accuracy with recall; accuracy with NLPD.
                writer.WriteLine("{0:0.0000}", accArr[i]);
                writer.WriteLine("{0:0.0000},{1:0.0000}", accArr[i], avgRec[i]);
                writer.WriteLine("{0:0.0000},{1:0.0000}", accArr[i], nlpdArr[i]);
            }
        }

        using (StreamWriter writer = new StreamWriter(String.Format("{2}{0}_parameters_{1}.csv", modelName, suffix, resultsDir)))
        {
            results.WriteResults(writer, true, true, true);
        }

        using (StreamWriter writer = new StreamWriter(String.Format("{2}{0}_taskValue_{1}.csv", modelName, suffix, resultsDir)))
        {
            for (int i = 0; i < taskValue.Count; i++)
            {
                writer.WriteLine(String.Format("{0}\t{1}\t{2:0.000}", taskValue[i].TaskId, taskValue[i].WorkerId, taskValue[i].TaskValue));
            }
        }
    }

    /// <summary>
    /// Returns a list of sub-data selected sequentially from the input data list.
    /// </summary>
    /// <param name="groupedRandomisedData">The randomised data.</param>
    /// <param name="currentCounts">The current data count per task.</param>
    /// <param name="workersPerTask">The dictionary keyed by taskId and the value is an hashset of workerId who have remaining labels for the tasks.</param>
    /// <returns>The list of sub-data.</returns>
    public static List<Datum> GetSubdata(Dictionary<string, Datum[]> groupedRandomisedData, Dictionary<string, int> currentCounts, Dictionary<string, HashSet<string>> workersPerTask)
    {
        var data = groupedRandomisedData.Select(g => g.Value.Take(currentCounts[g.Key])).SelectMany(d => d).ToList();
        // Mark the workers whose labels have been consumed as no longer available.
        foreach (Datum d in data)
        {
            workersPerTask[d.TaskId].Remove(d.WorkerId);
        }
        return data;
    }

    /// <summary>
    /// Return the list of sub-data for the task with the highest utility.
    /// </summary>
    /// <param name="groupedRandomisedData">The randomised data.</param>
    /// <param name="taskValue">The dictionary keyed by taskId and the value is an active learning result instance.</param>
    /// <param name="currentCounts">The current data count per task.</param>
    /// <param name="totalCounts">The total data count for all the tasks.</param>
    /// <param name="numIncremData">The number of data to be selected.</param>
    /// <returns>The list of sub-data.</returns>
    public static List<Datum> GetNextData(
        Dictionary<string, Datum[]> groupedRandomisedData,
        Dictionary<string, ActiveLearningResult> taskValue,
        Dictionary<string, int> currentCounts,
        Dictionary<string, int> totalCounts,
        int numIncremData)
    {
        List<Datum> data = new List<Datum>();
        // Visit tasks in decreasing order of utility.
        var sortedTaskValues = taskValue.OrderByDescending(kvp => kvp.Value.TaskValue).ToArray();
        if (numIncremData > sortedTaskValues.Length)
            numIncremData = sortedTaskValues.Length;

        int numAdded = 0;
        for (; ; )
        {
            // Stop when every task has consumed all of its available labels.
            bool noMoreData = currentCounts.All(kvp => kvp.Value >= totalCounts[kvp.Key]);
            if (noMoreData)
                break;
            for (int i = 0; i < sortedTaskValues.Length; i++)
            {
                var task = sortedTaskValues[i].Key;
                int index = currentCounts[task];
                if (index >= totalCounts[task])
                    continue;
                data.Add(groupedRandomisedData[task][index]);
                currentCounts[task] = index + 1;
                if (++numAdded >= numIncremData)
                    return data;
            }
        }
        return data;
    }

    /// <summary>
    /// Active learning results class with instances representing
    /// pairs of tasks and workers with their utility value.
    /// </summary>
    public class ActiveLearningResult
    {
        /// <summary>
        /// The task id.
        /// </summary>
        public string TaskId
        {
            get;
            set;
        }

        /// <summary>
        /// The worker id.
        /// </summary>
        public string WorkerId
        {
            get;
            set;
        }

        /// <summary>
        /// The utility of a label provided by the worker for the task.
        /// </summary>
        public double TaskValue
        {
            get;
            set;
        }
    }
}
/// <summary>
/// Methods for selecting tasks.
/// </summary>
public enum TaskSelectionMethod
{
    /// <summary>
    /// Selects the next task uniformly at random.
    /// </summary>
    RandomTask,

    /// <summary>
    /// Selects the task whose true-label posterior has the highest entropy.
    /// </summary>
    EntropyTask,
}
/// <summary>
/// Methods for selecting workers
/// </summary>
public enum WorkerSelectionMethod
{
    /// <summary>
    /// Selects workers uniformly at random (the only implemented method).
    /// </summary>
    RandomWorker
}
}

Просмотреть файл

@ -0,0 +1,328 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using Microsoft.ML.Probabilistic;
using Microsoft.ML.Probabilistic.Algorithms;
using Microsoft.ML.Probabilistic.Distributions;
using Microsoft.ML.Probabilistic.Math;
using Microsoft.ML.Probabilistic.Models;
using Microsoft.ML.Probabilistic.Utilities;
namespace Crowdsourcing
{
/// <summary>
/// The BCC model class.
/// </summary>
public class BCC
{
/// <summary>
/// The number of label values, or zero before the model has been created.
/// </summary>
public int LabelCount
{
    get
    {
        if (c == null)
        {
            return 0;
        }
        return c.SizeAsInt;
    }
}
/// <summary>
/// The number of tasks, or zero before the model has been created.
/// </summary>
public int TaskCount
{
    get
    {
        if (n == null)
        {
            return 0;
        }
        return n.SizeAsInt;
    }
}
// Ranges
/// <summary>Range over tasks (size taskCount); see DefineVariablesAndRanges.</summary>
protected Range n;
/// <summary>Range over workers (sized by WorkerCount).</summary>
protected Range k;
/// <summary>Range over label values (size labelCount).</summary>
protected Range c;
/// <summary>Range over the tasks of each worker (sized by WorkerTaskCount[k]).</summary>
protected Range kn;
// Variables in the model
/// <summary>The number of workers; observed when data is attached.</summary>
protected Variable<int> WorkerCount;
/// <summary>The unobserved true label of each task.</summary>
protected VariableArray<int> TrueLabel;
/// <summary>The number of tasks labelled by each worker.</summary>
protected VariableArray<int> WorkerTaskCount;
/// <summary>The task indices labelled by each worker.</summary>
protected VariableArray<VariableArray<int>, int[][]> WorkerTaskIndex;
/// <summary>The label each worker gave to each of their tasks; observed in training, inferred in prediction.</summary>
protected VariableArray<VariableArray<int>, int[][]> WorkerLabel;
/// <summary>The background probability vector over label values.</summary>
protected Variable<Vector> BackgroundLabelProb;
/// <summary>Per-worker confusion matrix: one probability vector per true label value.</summary>
protected VariableArray<VariableArray<Vector>, Vector[][]> WorkerConfusionMatrix;
/// <summary>Model evidence variable; the whole model is gated on it (see CreateModel).</summary>
protected Variable<bool> Evidence;
// Prior distributions
/// <summary>Dirichlet prior on the background label probabilities.</summary>
protected Variable<Dirichlet> BackgroundLabelProbPrior;
/// <summary>Dirichlet priors on the rows of each worker's confusion matrix.</summary>
protected VariableArray<VariableArray<Dirichlet>, Dirichlet[][]> ConfusionMatrixPrior;
/// <summary>Per-task constraint on the true label, used for online learning.</summary>
protected VariableArray<Discrete> TrueLabelConstraint;
/// <summary>Prior on the evidence variable.</summary>
protected Variable<Bernoulli> EvidencePrior;
// Inference engine
/// <summary>The inference engine; configured in DefineInferenceEngine.</summary>
protected InferenceEngine Engine;
// Hyperparameters and inference settings
/// <summary>
/// The prior belief that a worker gives the correct label: the fraction of the
/// confusion-matrix prior's mass placed on the diagonal (see GetConfusionMatrixPrior).
/// Default is 0.5.
/// </summary>
public double InitialWorkerBelief
{
    get;
    set;
}

/// <summary>
/// The number of inference iterations. Default is 35.
/// </summary>
public int NumberOfIterations
{
    get;
    set;
}
/// <summary>
/// Creates a BCC model instance with the default hyperparameters.
/// </summary>
public BCC()
{
    this.EvidencePrior = new Bernoulli(0.5);
    this.InitialWorkerBelief = 0.5;
    this.NumberOfIterations = 35;
}
/// <summary>
/// Initializes the ranges, the generative process and the inference engine of the BCC model.
/// </summary>
/// <param name="taskCount">The number of tasks.</param>
/// <param name="labelCount">The number of labels.</param>
public virtual void CreateModel(int taskCount, int labelCount)
{
    // Gate the whole model definition on the Evidence variable so that model
    // evidence can be inferred alongside the other posteriors (see Infer).
    Evidence = Variable<bool>.Random(this.EvidencePrior);
    var evidenceBlock = Variable.If(Evidence);
    DefineVariablesAndRanges(taskCount, labelCount);
    DefineGenerativeProcess();
    DefineInferenceEngine();
    // Close the evidence gate only after the full model has been defined.
    evidenceBlock.CloseBlock();
}
/// <summary>
/// Initializes the ranges and the variables of the BCC model.
/// </summary>
/// <param name="taskCount">The number of tasks.</param>
/// <param name="labelCount">The number of labels.</param>
protected virtual void DefineVariablesAndRanges(int taskCount, int labelCount)
{
    WorkerCount = Variable.New<int>().Named("WorkerCount");
    n = new Range(taskCount).Named("n");
    c = new Range(labelCount).Named("c");
    k = new Range(WorkerCount).Named("k");

    // The tasks for each worker
    WorkerTaskCount = Variable.Array<int>(k).Named("WorkerTaskCount");
    kn = new Range(WorkerTaskCount[k]).Named("kn");
    WorkerTaskIndex = Variable.Array(Variable.Array<int>(kn), k).Named("WorkerTaskIndex");
    WorkerTaskIndex.SetValueRange(n);

    // The worker labels. (The original code created this array twice with the
    // same definition; the redundant second assignment has been removed.)
    WorkerLabel = Variable.Array(Variable.Array<int>(kn), k).Named("WorkerLabel");

    // The background probability vector
    BackgroundLabelProbPrior = Variable.New<Dirichlet>().Named("BackgroundLabelProbPrior");
    BackgroundLabelProb = Variable<Vector>.Random(BackgroundLabelProbPrior).Named("BackgroundLabelProb");
    BackgroundLabelProb.SetValueRange(c);

    // The confusion matrices for each worker
    ConfusionMatrixPrior = Variable.Array(Variable.Array<Dirichlet>(c), k).Named("ConfusionMatrixPrior");
    WorkerConfusionMatrix = Variable.Array(Variable.Array<Vector>(c), k).Named("ConfusionMatrix");
    WorkerConfusionMatrix[k][c] = Variable<Vector>.Random(ConfusionMatrixPrior[k][c]);
    WorkerConfusionMatrix.SetValueRange(c);

    // The unobserved 'true' label for each task
    TrueLabel = Variable.Array<int>(n).Attrib(QueryTypes.Marginal).Attrib(QueryTypes.MarginalDividedByPrior).Named("Truth");
    TrueLabelConstraint = Variable.Array<Discrete>(n).Named("TruthConstraint");

    // Constraint for online learning
    TrueLabel[n] = Variable.Discrete(BackgroundLabelProb).ForEach(n);
    Variable.ConstrainEqualRandom(TrueLabel[n], TrueLabelConstraint[n]);
}
/// <summary>
/// Defines the BCC generative process: each worker label is drawn from the
/// worker's confusion-matrix row selected by the task's true label.
/// </summary>
protected virtual void DefineGenerativeProcess()
{
    // The process that generates the worker's label
    using (Variable.ForEach(k))
    {
        // Gather the true labels of just the tasks this worker labelled.
        var trueLabel = Variable.Subarray(TrueLabel, WorkerTaskIndex[k]);
        trueLabel.SetValueRange(c);
        using (Variable.ForEach(kn))
        {
            // Switch on the (uncertain) true label to index the confusion matrix row.
            using (Variable.Switch(trueLabel[kn]))
            {
                WorkerLabel[k][kn] = Variable.Discrete(WorkerConfusionMatrix[k][trueLabel[kn]]);
            }
        }
    }
}
/// <summary>
/// Creates and configures the expectation-propagation inference engine used by the model.
/// </summary>
protected virtual void DefineInferenceEngine()
{
    var engine = new InferenceEngine(new ExpectationPropagation());
    engine.Compiler.UseParallelForLoops = true;
    engine.Compiler.WriteSourceFiles = false;
    engine.ShowProgress = false;
    Engine = engine;
}
/// <summary>
/// Sets the priors of BCC.
/// </summary>
/// <param name="workerCount">The number of workers.</param>
/// <param name="priors">The priors; if null, uniform/default priors are used.</param>
protected virtual void SetPriors(int workerCount, Posteriors priors)
{
    int numClasses = c.SizeAsInt;
    WorkerCount.ObservedValue = workerCount;
    if (priors == null)
    {
        // Fresh run: uniform background prior, the default diagonal
        // confusion-matrix prior for every worker, and uniform label constraints.
        BackgroundLabelProbPrior.ObservedValue = Dirichlet.Uniform(numClasses);
        var confusionMatrixPrior = GetConfusionMatrixPrior();
        ConfusionMatrixPrior.ObservedValue = Util.ArrayInit(workerCount, worker => Util.ArrayInit(numClasses, lab => confusionMatrixPrior[lab]));
        TrueLabelConstraint.ObservedValue = Util.ArrayInit(TaskCount, t => Discrete.Uniform(numClasses));
    }
    else
    {
        // Online learning: reuse the posteriors of a previous run as the priors.
        BackgroundLabelProbPrior.ObservedValue = priors.BackgroundLabelProb;
        ConfusionMatrixPrior.ObservedValue = priors.WorkerConfusionMatrix;
        TrueLabelConstraint.ObservedValue = priors.TrueLabelConstraint;
    }
}
/// <summary>
/// Attaches the data to the workers' labels, leaving the confusion matrix priors unchanged.
/// </summary>
/// <param name="taskIndices">The matrix of the task indices (columns) of each worker (rows).</param>
/// <param name="workerLabels">The matrix of the labels (columns) of each worker (rows).</param>
protected virtual void AttachData(int[][] taskIndices, int[][] workerLabels)
    => AttachData(taskIndices, workerLabels, null);
/// <summary>
/// Attaches the data to the workers' labels and optionally sets the workers' confusion matrix priors.
/// </summary>
/// <param name="taskIndices">The matrix of the task indices (columns) of each worker (rows).</param>
/// <param name="workerLabels">The matrix of the labels (columns) of each worker (rows).</param>
/// <param name="confusionMatrixPrior">The workers' confusion matrix priors; null to leave the current priors unchanged.</param>
protected virtual void AttachData(int[][] taskIndices, int[][] workerLabels, Dirichlet[][] confusionMatrixPrior)
{
    int numClasses = c.SizeAsInt;
    WorkerCount.ObservedValue = taskIndices.Length;
    WorkerTaskCount.ObservedValue = taskIndices.Select(tasks => tasks.Length).ToArray();
    WorkerTaskIndex.ObservedValue = taskIndices;
    // Prediction mode is indicated by none of the workers having a label.
    // We can just look at the first one
    if (workerLabels[0] != null)
    {
        WorkerLabel.ObservedValue = workerLabels;
    }
    else
    {
        // Prediction: leave the labels unobserved so they can be inferred.
        WorkerLabel.ClearObservedValue();
    }
    if (confusionMatrixPrior != null)
    {
        ConfusionMatrixPrior.ObservedValue = Util.ArrayInit(confusionMatrixPrior.Length, worker => Util.ArrayInit(numClasses, lab => confusionMatrixPrior[worker][lab]));
    }
}
/// <summary>
/// Infers the posteriors of BCC using the attached data and priors.
/// </summary>
/// <param name="taskIndices">The matrix of the task indices (columns) of each worker (rows).</param>
/// <param name="workerLabels">The matrix of the labels (columns) of each worker (rows).</param>
/// <param name="priors">The priors of the BCC parameters (null for the default priors).</param>
/// <returns>The inferred BCC posteriors.</returns>
public virtual Posteriors Infer(int[][] taskIndices, int[][] workerLabels, Posteriors priors)
{
    SetPriors(workerLabels.Length, priors);
    AttachData(taskIndices, workerLabels, null);
    Engine.NumberOfIterations = NumberOfIterations;

    // Query each marginal of interest; initializer entries run in declaration order.
    var posteriors = new Posteriors
    {
        Evidence = Engine.Infer<Bernoulli>(Evidence),
        BackgroundLabelProb = Engine.Infer<Dirichlet>(BackgroundLabelProb),
        WorkerConfusionMatrix = Engine.Infer<Dirichlet[][]>(WorkerConfusionMatrix),
        TrueLabel = Engine.Infer<Discrete[]>(TrueLabel),
        TrueLabelConstraint = Engine.Infer<Discrete[]>(TrueLabel, QueryTypes.MarginalDividedByPrior)
    };

    // Prediction mode is indicated by none of the workers having a label.
    // We can just look at the first one
    if (workerLabels[0] == null)
    {
        posteriors.WorkerPrediction = Engine.Infer<Discrete[][]>(WorkerLabel);
    }
    return posteriors;
}
/// <summary>
/// Returns the confusion matrix prior of each worker.
/// </summary>
/// <returns>The confusion matrix prior of each worker.</returns>
public Dirichlet[] GetConfusionMatrixPrior()
{
    // Diagonal pseudo-count chosen so that, with off-diagonal pseudo-counts of 1,
    // the prior mean of the diagonal entry equals InitialWorkerBelief.
    double diagonalPseudoCount = (InitialWorkerBelief / (1 - InitialWorkerBelief)) * (LabelCount - 1);
    return Util.ArrayInit(
        LabelCount,
        d => new Dirichlet(Util.ArrayInit(LabelCount, i => i == d ? diagonalPseudoCount : 1.0)));
}
/// <summary>
/// The BCC posteriors class.
/// </summary>
[Serializable]
public class Posteriors
{
    /// <summary>
    /// The probabilities that generate the true labels of all the tasks.
    /// </summary>
    public Dirichlet BackgroundLabelProb;

    /// <summary>
    /// The probabilities of the true label of each task.
    /// </summary>
    public Discrete[] TrueLabel;

    /// <summary>
    /// The Dirichlet parameters of the confusion matrix of each worker.
    /// </summary>
    public Dirichlet[][] WorkerConfusionMatrix;

    /// <summary>
    /// The predictive probabilities of the worker's labels.
    /// Only populated in prediction mode (when no worker labels were observed);
    /// otherwise null.
    /// </summary>
    public Discrete[][] WorkerPrediction;

    /// <summary>
    /// The true label constraint used in online training.
    /// </summary>
    public Discrete[] TrueLabelConstraint;

    /// <summary>
    /// The model evidence.
    /// </summary>
    public Bernoulli Evidence;
}
}
}

Просмотреть файл

@ -0,0 +1,438 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using Microsoft.ML.Probabilistic;
using Microsoft.ML.Probabilistic.Algorithms;
using Microsoft.ML.Probabilistic.Factors;
using Microsoft.ML.Probabilistic.Distributions;
using Microsoft.ML.Probabilistic.Math;
using Microsoft.ML.Probabilistic.Models;
using Microsoft.ML.Probabilistic.Utilities;
namespace Crowdsourcing
{
/// <summary>
/// The CommunityBCC (CBCC) model class. Extends BCC with a latent community membership
/// for each worker: each community has a score matrix, each member worker's score matrix
/// is a noisy (Gaussian) copy of it, and the worker's confusion matrix is its softmax.
/// </summary>
public class CommunityModel : BCC
{
    // Additional ranges
    protected Range m; // ranges over the communities

    // Additional variables
    protected VariableArray<int> Community;
    protected VariableArray<Discrete> CommunityInit; // initialiser used to break symmetry between communities
    protected Variable<Vector> CommunityProb;
    protected VariableArray<VariableArray<Vector>, Vector[][]> ScoreMatrix;
    protected VariableArray<VariableArray<Vector>, Vector[][]> CommunityScoreMatrix;
    protected VariableArray<VariableArray<Vector>, Vector[][]> CommunityConfusionMatrix;
    protected Variable<PositiveDefiniteMatrix> NoiseMatrix = Variable.New<PositiveDefiniteMatrix>().Named("NoiseMatrix");

    // Additional priors
    protected VariableArray<Discrete> CommunityConstraint;
    protected VariableArray<VariableArray<VectorGaussian>, VectorGaussian[][]> ScoreMatrixConstraint;
    protected VariableArray<VariableArray<VectorGaussian>, VectorGaussian[][]> CommunityScoreMatrixPrior;
    protected Variable<Dirichlet> CommunityProbPrior;

    /// <summary>
    /// The noise precision that generates the workers score matrix from the communities score matrix.
    /// </summary>
    public double NoisePrecision
    {
        get;
        set;
    }

    /// <summary>
    /// The number of communities.
    /// </summary>
    public int CommunityCount
    {
        get;
        protected set;
    }

    /// <summary>
    /// The mean vector of the Gaussian distribution generating the community score matrices.
    /// For each community, Item1 is the mean of the diagonal entries and Item2 the mean of
    /// the off-diagonal entries.
    /// </summary>
    public Tuple<double, double>[] ScoreMeanParameters
    {
        get;
        set;
    }

    /// <summary>
    /// The precision matrix of the Gaussian distribution generating the community score matrices
    /// (one scalar per community, used to scale an identity matrix).
    /// </summary>
    public double[] ScorePrecisionParameters
    {
        get;
        set;
    }

    /// <summary>
    /// The hyperparameter governing community membership (symmetric Dirichlet pseudo-count).
    /// </summary>
    public double CommunityPseudoCount
    {
        get;
        set;
    }

    /// <summary>
    /// The prior for the score matrices.
    /// </summary>
    public VectorGaussian[][] CommunityScoreMatrixPriorObserved
    {
        get;
        protected set;
    }

    /// <summary>
    /// The prior for community membership.
    /// </summary>
    public Dirichlet CommunityProbPriorObserved
    {
        get;
        protected set;
    }

    /// <summary>
    /// Creates a CBCC model instance with default hyperparameters
    /// (noise precision 5, 3 communities, community pseudo-count 10).
    /// </summary>
    public CommunityModel()
        : base()
    {
        NoisePrecision = 5;
        CommunityCount = 3;
        CommunityPseudoCount = 10.0;
        ScoreMeanParameters = null;
        ScorePrecisionParameters = null;
    }

    /// <summary>
    /// Initializes the CBCC model using the current <see cref="CommunityCount"/>.
    /// </summary>
    /// <param name="taskCount">The number of tasks.</param>
    /// <param name="labelCount">The number of labels.</param>
    public override void CreateModel(int taskCount, int labelCount)
    {
        CreateModel(taskCount, labelCount, CommunityCount);
    }

    /// <summary>
    /// Initializes the CBCC model with a number of communities.
    /// </summary>
    /// <param name="taskCount">The number of tasks.</param>
    /// <param name="labelCount">The number of labels.</param>
    /// <param name="communityCount">The number of communities.</param>
    public virtual void CreateModel(int taskCount, int labelCount, int communityCount)
    {
        // The whole model sits inside an evidence block so that the model
        // marginal likelihood can be read off the Evidence variable.
        Evidence = Variable<bool>.Random(this.EvidencePrior);
        var evidenceBlock = Variable.If(Evidence);
        CommunityCount = communityCount;
        CommunityProbPriorObserved = Dirichlet.Symmetric(communityCount, CommunityPseudoCount);
        DefineVariablesAndRanges(taskCount, labelCount);
        DefineGenerativeProcess();
        DefineInferenceEngine();
        evidenceBlock.CloseBlock();
        if (ScoreMeanParameters == null)
        {
            // No explicit mean parameters: derive the community score prior from the
            // BCC confusion-matrix prior via an auxiliary inference (see GetScoreMatrixPrior).
            var scoreMatrixPrior = GetScoreMatrixPrior();
            CommunityScoreMatrixPriorObserved = Util.ArrayInit(CommunityCount, comm => Util.ArrayInit(labelCount, lab => new VectorGaussian(scoreMatrixPrior[lab])));
        }
        else
        {
            // Explicit parameters: diagonal entries get Item1, off-diagonal entries Item2,
            // with an isotropic precision per community.
            CommunityScoreMatrixPriorObserved = Util.ArrayInit(
                CommunityCount,
                comm => Util.ArrayInit(
                    labelCount, lab => VectorGaussian.FromMeanAndPrecision(
                        Vector.FromArray(
                            Util.ArrayInit(labelCount, lab1 => lab == lab1 ? ScoreMeanParameters[comm].Item1 : ScoreMeanParameters[comm].Item2)),
                        PositiveDefiniteMatrix.IdentityScaledBy(LabelCount, ScorePrecisionParameters[comm]))));
        }
    }

    /// <summary>
    /// Defines the variables and the ranges of CBCC.
    /// </summary>
    /// <param name="taskCount">The number of tasks.</param>
    /// <param name="labelCount">The number of labels.</param>
    protected override void DefineVariablesAndRanges(int taskCount, int labelCount)
    {
        WorkerCount = Variable.New<int>().Named("WorkerCount");
        m = new Range(CommunityCount).Named("m");
        n = new Range(taskCount).Named("n");
        c = new Range(labelCount).Named("c");
        k = new Range(WorkerCount).Named("k");

        // The tasks for each worker
        WorkerTaskCount = Variable.Array<int>(k).Named("WorkerTaskCount");
        kn = new Range(WorkerTaskCount[k]).Named("kn");
        WorkerTaskIndex = Variable.Array(Variable.Array<int>(kn), k).Named("WorkerTaskIndex");
        WorkerTaskIndex.SetValueRange(n);
        WorkerLabel = Variable.Array(Variable.Array<int>(kn), k).Named("WorkerLabel");

        // The background probability vector
        BackgroundLabelProbPrior = Variable.New<Dirichlet>().Named("BackgroundLabelProbPrior");
        BackgroundLabelProb = Variable<Vector>.Random(BackgroundLabelProbPrior).Named("BackgroundLabelProb");
        BackgroundLabelProb.SetValueRange(c);

        // Community membership
        CommunityProbPrior = Variable.New<Dirichlet>().Named("CommunityProbPrior");
        CommunityProb = Variable<Vector>.Random(CommunityProbPrior).Named("CommunityProb");
        CommunityProb.SetValueRange(m);
        Community = Variable.Array<int>(k).Attrib(QueryTypes.Marginal).Attrib(QueryTypes.MarginalDividedByPrior).Named("Community");
        CommunityConstraint = Variable.Array<Discrete>(k).Named("CommunityConstraint");
        Community[k] = Variable.Discrete(CommunityProb).ForEach(k);
        Variable.ConstrainEqualRandom(Community[k], CommunityConstraint[k]);

        // Initialiser to break symmetry for community membership
        CommunityInit = Variable.Array<Discrete>(k).Named("CommunityInit");
        Community[k].InitialiseTo(CommunityInit[k]);

        // Community parameters
        CommunityScoreMatrixPrior = Variable.Array(Variable.Array<VectorGaussian>(c), m).Named("CommunityScoreMatrixPrior");
        CommunityScoreMatrix = Variable.Array(Variable.Array<Vector>(c), m).Named("CommunityScoreMatrix");
        CommunityScoreMatrix[m][c] = Variable<Vector>.Random(CommunityScoreMatrixPrior[m][c]);
        CommunityConfusionMatrix = Variable.Array(Variable.Array<Vector>(c), m).Named("CommunityConfusionMatrix");
        CommunityConfusionMatrix[m][c] = Variable.Softmax(CommunityScoreMatrix[m][c]);
        CommunityScoreMatrix.SetValueRange(c);

        // Parameters for each worker
        ScoreMatrix = Variable.Array(Variable.Array<Vector>(c), k).Attrib(QueryTypes.Marginal).Attrib(QueryTypes.MarginalDividedByPrior).Named("ScoreMatrix");
        ScoreMatrixConstraint = Variable.Array(Variable.Array<VectorGaussian>(c), k).Named("ScoreMatrixConstraint");
        WorkerConfusionMatrix = Variable.Array(Variable.Array<Vector>(c), k).Named("ConfusionMatrix");

        // The unobserved 'true' label for each task
        TrueLabel = Variable.Array<int>(n).Attrib(QueryTypes.Marginal).Attrib(QueryTypes.MarginalDividedByPrior).Named("Truth");
        TrueLabelConstraint = Variable.Array<Discrete>(n).Named("TruthConstraint");
        TrueLabel[n] = Variable.Discrete(BackgroundLabelProb).ForEach(n);
        Variable.ConstrainEqualRandom(TrueLabel[n], TrueLabelConstraint[n]);

        // The labels given by the workers
        // NOTE(review): WorkerLabel was already created above; this second assignment
        // replaces it with a fresh array of the same name - looks redundant, confirm.
        WorkerLabel = Variable.Array(Variable.Array<int>(kn), k).Named("WorkerLabel");
    }

    /// <summary>
    /// Defines the generative process of CBCC.
    /// </summary>
    protected override void DefineGenerativeProcess()
    {
        // The process that generates the worker's label
        using (Variable.ForEach(k))
        {
            // The worker's score matrix is a noisy copy of its community's score matrix.
            using (Variable.Switch(Community[k]))
            {
                ScoreMatrix[k][c] = Variable.VectorGaussianFromMeanAndPrecision(CommunityScoreMatrix[Community[k]][c], NoiseMatrix);
            }
            Variable.ConstrainEqualRandom(ScoreMatrix[k][c], ScoreMatrixConstraint[k][c]);
            WorkerConfusionMatrix[k][c] = Variable.Softmax(ScoreMatrix[k][c]);
            var trueLabel = Variable.Subarray(TrueLabel, WorkerTaskIndex[k]);
            trueLabel.SetValueRange(c);
            using (Variable.ForEach(kn))
            {
                using (Variable.Switch(trueLabel[kn]))
                {
                    // The observed label is drawn from the confusion-matrix row of the true label.
                    WorkerLabel[k][kn] = Variable.Discrete(WorkerConfusionMatrix[k][trueLabel[kn]]);
                }
            }
        }
    }

    /// <summary>
    /// Initializes the CBCC inference engine: VMP with parallel loops enabled and the
    /// SoftmaxOp_BL06 operator prioritized for the softmax factors.
    /// </summary>
    protected override void DefineInferenceEngine()
    {
        Engine = new InferenceEngine(new VariationalMessagePassing());
        Engine.ShowProgress = false;
        Engine.Compiler.UseParallelForLoops = true;
        Engine.Compiler.GivePriorityTo(typeof(SoftmaxOp_BL06));
        Engine.Compiler.WriteSourceFiles = false;
    }

    /// <summary>
    /// Attaches the data to the workers labels.
    /// </summary>
    /// <param name="taskIndices">The matrix of the task indices (columns) of each worker (rows).</param>
    /// <param name="workerLabels">The matrix of the labels (columns) of each worker (rows).</param>
    protected override void AttachData(int[][] taskIndices, int[][] workerLabels)
    {
        AttachData(taskIndices, workerLabels, null, null);
    }

    /// <summary>
    /// Attaches the data to the workers labels and sets the constraints on the community score matrices and
    /// the community memberships (used for online training).
    /// </summary>
    /// <param name="taskIndices">The matrix of the task indices (columns) of each worker (rows).</param>
    /// <param name="workerLabels">The matrix of the labels (columns) of each worker (rows).</param>
    /// <param name="scoreConstraint">The constraint of the community score matrices (null for uniform).</param>
    /// <param name="communityConstraint">The constraint of the workers community membership (null for uniform).</param>
    protected void AttachData(int[][] taskIndices, int[][] workerLabels, VectorGaussian[][] scoreConstraint, Discrete[] communityConstraint)
    {
        int communityCount = m.SizeAsInt;
        int workerCount = workerLabels.Length;
        int labelCount = c.SizeAsInt;
        base.AttachData(taskIndices, workerLabels);
        // Random point-mass initialisation of community membership to break symmetry.
        CommunityInit.ObservedValue = Util.ArrayInit(workerCount, worker => Discrete.PointMass(Rand.Int(communityCount), communityCount));
        if (scoreConstraint != null)
        {
            ScoreMatrixConstraint.ObservedValue = scoreConstraint;
        }
        else
        {
            ScoreMatrixConstraint.ObservedValue = Util.ArrayInit(workerCount, w => Util.ArrayInit(labelCount, lab => VectorGaussian.Uniform(labelCount)));
        }
        if (communityConstraint != null)
        {
            CommunityConstraint.ObservedValue = communityConstraint;
        }
        else
        {
            CommunityConstraint.ObservedValue = Util.ArrayInit(workerCount, w => Discrete.Uniform(communityCount));
        }
    }

    /// <summary>
    /// Sets the priors of CBCC.
    /// </summary>
    /// <param name="workerCount">The number of workers.</param>
    /// <param name="priors">The priors (expected to be a <see cref="CommunityModel.Posteriors"/> when non-null).</param>
    protected override void SetPriors(int workerCount, BCC.Posteriors priors)
    {
        int communityCount = m.SizeAsInt;
        int labelCount = c.SizeAsInt;
        WorkerCount.ObservedValue = workerCount;
        NoiseMatrix.ObservedValue = PositiveDefiniteMatrix.IdentityScaledBy(labelCount, NoisePrecision);
        // NOTE(review): this cast throws if a non-null plain BCC.Posteriors is passed - confirm callers.
        CommunityModel.Posteriors cbccPriors = (CommunityModel.Posteriors)priors;
        if (cbccPriors == null || cbccPriors.BackgroundLabelProb == null)
            BackgroundLabelProbPrior.ObservedValue = Dirichlet.Uniform(labelCount);
        else
            BackgroundLabelProbPrior.ObservedValue = cbccPriors.BackgroundLabelProb;
        if (cbccPriors == null || cbccPriors.CommunityProb == null)
            CommunityProbPrior.ObservedValue = CommunityProbPriorObserved;
        else
            CommunityProbPrior.ObservedValue = cbccPriors.CommunityProb;
        if (cbccPriors == null || cbccPriors.CommunityScoreMatrix == null)
            CommunityScoreMatrixPrior.ObservedValue = CommunityScoreMatrixPriorObserved;
        else
            CommunityScoreMatrixPrior.ObservedValue = cbccPriors.CommunityScoreMatrix;
        if (cbccPriors == null || cbccPriors.TrueLabelConstraint == null)
            TrueLabelConstraint.ObservedValue = Util.ArrayInit(TaskCount, t => Discrete.Uniform(labelCount));
        else
            TrueLabelConstraint.ObservedValue = cbccPriors.TrueLabelConstraint;
    }

    /// <summary>
    /// Infers the posteriors of CBCC using the attached data.
    /// </summary>
    /// <param name="taskIndices">The matrix of the task indices (columns) of each worker (rows).</param>
    /// <param name="workerLabels">The matrix of the labels (columns) of each worker (rows).</param>
    /// <param name="priors">The priors.</param>
    /// <returns>The inferred CBCC posteriors.</returns>
    public override BCC.Posteriors Infer(int[][] taskIndices, int[][] workerLabels, BCC.Posteriors priors)
    {
        var cbccPriors = (CommunityModel.Posteriors)priors;
        VectorGaussian[][] scoreConstraint = (cbccPriors == null ? null : cbccPriors.WorkerScoreMatrixConstraint);
        Discrete[] communityConstraint = (cbccPriors == null ? null : cbccPriors.WorkerCommunityConstraint);
        SetPriors(workerLabels.Length, priors);
        AttachData(taskIndices, workerLabels, scoreConstraint, communityConstraint);
        var result = new CommunityModel.Posteriors();
        Engine.NumberOfIterations = NumberOfIterations;
        result.Evidence = Engine.Infer<Bernoulli>(Evidence);
        result.BackgroundLabelProb = Engine.Infer<Dirichlet>(BackgroundLabelProb);
        result.WorkerConfusionMatrix = Engine.Infer<Dirichlet[][]>(WorkerConfusionMatrix);
        result.TrueLabel = Engine.Infer<Discrete[]>(TrueLabel);
        result.TrueLabelConstraint = Engine.Infer<Discrete[]>(TrueLabel, QueryTypes.MarginalDividedByPrior);
        result.CommunityScoreMatrix = Engine.Infer<VectorGaussian[][]>(CommunityScoreMatrix);
        result.CommunityConfusionMatrix = Engine.Infer<Dirichlet[][]>(CommunityConfusionMatrix);
        result.WorkerScoreMatrixConstraint = Engine.Infer<VectorGaussian[][]>(ScoreMatrix, QueryTypes.MarginalDividedByPrior);
        result.CommunityProb = Engine.Infer<Dirichlet>(CommunityProb);
        result.Community = Engine.Infer<Discrete[]>(Community);
        result.WorkerCommunityConstraint = Engine.Infer<Discrete[]>(Community, QueryTypes.MarginalDividedByPrior);
        return result;
    }

    /// <summary>
    /// Returns the community score matrix prior. For each true label d, a small auxiliary
    /// VMP inference finds a VectorGaussian over scores whose softmax is constrained to the
    /// Dirichlet confusion-matrix row implied by InitialWorkerBelief.
    /// </summary>
    /// <returns>The community score matrix prior.</returns>
    private VectorGaussian[] GetScoreMatrixPrior()
    {
        var dim = new Range(LabelCount);
        var mean = Variable.VectorGaussianFromMeanAndPrecision(Vector.Zero(LabelCount), PositiveDefiniteMatrix.IdentityScaledBy(LabelCount, 1));
        var prec = Variable.WishartFromShapeAndRate(1.0, PositiveDefiniteMatrix.IdentityScaledBy(LabelCount, 1));
        var score = Variable.VectorGaussianFromMeanAndPrecision(mean, prec);
        var confusion = Variable.Softmax(score);
        confusion.SetValueRange(dim);
        var confusionConstraint = Variable.New<Dirichlet>();
        Variable.ConstrainEqualRandom(confusion, confusionConstraint);
        var engine = new InferenceEngine(new VariationalMessagePassing())
        {
            ShowProgress = false
        };
        engine.Compiler.WriteSourceFiles = false;
        var scorePrior = new VectorGaussian[LabelCount];
        for (int d = 0; d < LabelCount; d++)
        {
            // Same diagonal pseudo-count construction as BCC.GetConfusionMatrixPrior.
            confusionConstraint.ObservedValue = new Dirichlet(Util.ArrayInit(LabelCount, i => i == d ? (InitialWorkerBelief / (1 - InitialWorkerBelief)) * (LabelCount - 1) : 1.0));
            scorePrior[d] = engine.Infer<VectorGaussian>(score);
        }
        return scorePrior;
    }

    /// <summary>
    /// CBCC posterior object.
    /// </summary>
    [Serializable]
    public new class Posteriors : BCC.Posteriors
    {
        /// <summary>
        /// The Dirichlet posterior of the community membership probabilities.
        /// </summary>
        public Dirichlet CommunityProb;

        /// <summary>
        /// The posterior probabilities of the workers community memberships.
        /// </summary>
        public Discrete[] Community;

        /// <summary>
        /// The Dirichlet posteriors of the community confusion matrix.
        /// </summary>
        public Dirichlet[][] CommunityConfusionMatrix;

        /// <summary>
        /// The Gaussian posteriors of the community score matrix.
        /// </summary>
        public VectorGaussian[][] CommunityScoreMatrix;

        /// <summary>
        /// The Gaussian constraint of the community score matrix (used for online training).
        /// </summary>
        public VectorGaussian[][] WorkerScoreMatrixConstraint;

        /// <summary>
        /// The constraint of the workers community membership (used for online training).
        /// </summary>
        public Discrete[] WorkerCommunityConstraint;
    }
}
}

Просмотреть файл

@ -0,0 +1,324 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
/* Community-Based Bayesian Aggregation for Crowdsourcing
 *
 * Software to run the experiment presented in the paper "Community-Based Bayesian Aggregation Models for Crowdsourcing" by Venanzi et al., WWW14.
 * To run it, you must create a CSV file with your data in the format <Worker id, Task id, worker's label, (optional) task's gold label>.
 * See CF.csv for an example.
 * You can download the original CF data set used in the paper from www.crowdscale.org
 */
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.ML.Probabilistic.Math;
using Microsoft.ML.Probabilistic.Utilities;
namespace Crowdsourcing
{
/// <summary>
/// The class for the main program.
/// </summary>
class Crowdsourcing
{
    /// <summary>
    /// The names of the data sets to run (CSV files under the Data directory).
    /// </summary>
    static string[] GoldDatasets = new string[] { "CF" };

    /// <summary>
    /// The number of communities of CBCC for each data set.
    /// </summary>
    static int[] NumCommunities = new int[] { 4 };

    /// <summary>
    /// Flag to run Dawid-Skene (you will also need to link to the Dawid-Skene C# code).
    /// </summary>
    static bool RunDawidSkene = false;

    /// <summary>
    /// The results directory.
    /// </summary>
    static string ResultsDir = @"Results\";

    /// <summary>
    /// Main method to run the crowdsourcing experiments presented in Venanzi et al. (WWW14).
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        int startIndex = 0;
        int endIndex = GoldDatasets.Length - 1;
        int whichModel = -1; // Default value to run all the models
        Directory.CreateDirectory(ResultsDir);

        // Experiment of Figure 5 and Table 2
        RunFullGold(startIndex, endIndex);

        // Experiment of Figure 4
        RunWWWExperiments(startIndex, endIndex, whichModel);

        // Experiment to find the number of communities
        FindNumCommunities(startIndex, endIndex, 10);
    }

    /// <summary>
    /// Runs the active learning experiment presented in Venanzi et al. (WWW14)
    /// for all the models with an array of data sets.
    /// </summary>
    /// <param name="startIndex">First instance of the data set array.</param>
    /// <param name="endIndex">Last instance of the data set array.</param>
    /// <param name="whichModel">Model to run (1-4); any other value runs all the models.</param>
    static void RunWWWExperiments(int startIndex, int endIndex, int whichModel)
    {
        for (int ds = startIndex; ds <= endIndex; ds++)
        {
            switch (whichModel)
            {
                case 1: RunWWWActiveLearning(GoldDatasets[ds], RunType.MajorityVote, TaskSelectionMethod.EntropyTask, null); break;
                case 2:
                    if (RunDawidSkene)
                    {
                        RunWWWActiveLearning(GoldDatasets[ds], RunType.DawidSkene, TaskSelectionMethod.EntropyTask, null);
                    }
                    break;
                case 3: RunWWWActiveLearning(GoldDatasets[ds], RunType.BCC, TaskSelectionMethod.EntropyTask, new BCC()); break;
                case 4: RunWWWActiveLearning(GoldDatasets[ds], RunType.CBCC, TaskSelectionMethod.EntropyTask, new CommunityModel(), NumCommunities[ds]); break;
                default: // Run all
                    RunWWWActiveLearning(GoldDatasets[ds], RunType.MajorityVote, TaskSelectionMethod.EntropyTask, null);
                    if (RunDawidSkene)
                    {
                        RunWWWActiveLearning(GoldDatasets[ds], RunType.DawidSkene, TaskSelectionMethod.EntropyTask, null);
                    }
                    RunWWWActiveLearning(GoldDatasets[ds], RunType.BCC, TaskSelectionMethod.EntropyTask, new BCC());
                    RunWWWActiveLearning(GoldDatasets[ds], RunType.CBCC, TaskSelectionMethod.EntropyTask, new CommunityModel(), NumCommunities[ds]);
                    // NOTE(review): this extra run passes RunType.BCC with a CommunityModel
                    // instance (default community count) - confirm this is intentional.
                    RunWWWActiveLearning(GoldDatasets[ds], RunType.BCC, TaskSelectionMethod.EntropyTask, new CommunityModel());
                    break;
            }
        }
    }

    /// <summary>
    /// Runs the active learning experiment presented in Venanzi et al. (WWW14) on a single data set.
    /// </summary>
    /// <param name="dataSet">The data.</param>
    /// <param name="runType">The model run type.</param>
    /// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
    /// <param name="model">The model instance.</param>
    /// <param name="communityCount">The number of communities (only for CBCC).</param>
    static void RunWWWActiveLearning(string dataSet, RunType runType, TaskSelectionMethod taskSelectionMethod, BCC model, int communityCount = 4)
    {
        // Reset the random seed so results can be duplicated for the paper
        Rand.Restart(12347);
        var workerSelectionMethod = WorkerSelectionMethod.RandomWorker;
        var data = Datum.LoadData(@"Data\" + dataSet + ".csv");
        string modelName = GetModelName(dataSet, runType, taskSelectionMethod, workerSelectionMethod, communityCount);
        ActiveLearning.RunActiveLearning(data, modelName, runType, model, taskSelectionMethod, workerSelectionMethod, ResultsDir, communityCount);
    }

    /// <summary>
    /// Runs all the models on an array of full gold sets.
    /// </summary>
    /// <param name="startIndex">The first index of the gold set array.</param>
    /// <param name="endIndex">The last index of the gold set array.</param>
    static void RunFullGold(int startIndex, int endIndex)
    {
        Console.Write("RunFullGolds: Running models");
        for (int ds = startIndex; ds <= endIndex; ds++)
        {
            RunGold(GoldDatasets[ds], RunType.MajorityVote, null); Console.Write(".");
            if (RunDawidSkene)
            {
                RunGold(GoldDatasets[ds], RunType.DawidSkene, null);
                Console.Write(".");
            }
            RunGold(GoldDatasets[ds], RunType.BCC, new BCC()); Console.Write(".");
            RunGold(GoldDatasets[ds], RunType.CBCC, new CommunityModel(), NumCommunities[ds]); Console.Write(".");
        }
        Console.Write("done\n");
    }

    /// <summary>
    /// Finds the optimal number of communities by comparing the model evidence of CBCC
    /// runs with 1..communityUpperBound communities.
    /// </summary>
    /// <param name="startIndex">The first index of the gold set array.</param>
    /// <param name="endIndex">The last index of the gold set array.</param>
    /// <param name="communityUpperBound">The maximum number of communities.</param>
    static void FindNumCommunities(int startIndex, int endIndex, int communityUpperBound = 10)
    {
        Console.WriteLine("Find community count: Running models");
        var modelEvidence = Util.ArrayInit<double>(communityUpperBound, endIndex + 1, (i, j) => 0.0);
        for (int ds = startIndex; ds <= endIndex; ds++)
        {
            Console.WriteLine("Dataset: " + GoldDatasets[ds]);
            for (int communityCount = 1; communityCount <= communityUpperBound; communityCount++)
            {
                // Log evidence (model marginal likelihood) for each community count.
                Results results = RunGold(GoldDatasets[ds], RunType.CBCC, new CommunityModel(), communityCount);
                modelEvidence[communityCount - 1, ds] = results.ModelEvidence.LogOdds;
                Console.WriteLine("Community {0}: {1:0.0000}", communityCount, modelEvidence[communityCount - 1, ds]);
            }
        }
    }

    /// <summary>
    /// Runs a model with the full gold set.
    /// </summary>
    /// <param name="dataSet">The data.</param>
    /// <param name="runType">The model run type.</param>
    /// <param name="model">The model instance.</param>
    /// <param name="communityCount">The number of communities (only for CBCC).</param>
    /// <returns>The inference results.</returns>
    static Results RunGold(string dataSet, RunType runType, BCC model, int communityCount = 3)
    {
        // Reset the random seed so results can be duplicated for the paper
        Rand.Restart(12347);
        var data = Datum.LoadData(@".\Data\" + dataSet + ".csv");
        int totalLabels = data.Count();
        string modelName = GetModelName(dataSet, runType, TaskSelectionMethod.EntropyTask, WorkerSelectionMethod.RandomWorker);
        Results results = new Results();
        switch (runType)
        {
            case RunType.VoteDistribution:
                results.RunMajorityVote(data, true, true);
                break;
            case RunType.MajorityVote:
                results.RunMajorityVote(data, true, false);
                break;
            case RunType.DawidSkene:
                results.RunDawidSkene(data, true);
                break;
            default:
                results.RunBCC(modelName, data, data, model, Results.RunMode.ClearResults, false, communityCount, false, false);
                break;
        }

        // Write the inference results on a csv file
        using (StreamWriter writer = new StreamWriter(ResultsDir + "endpoints.csv", true))
        {
            writer.WriteLine("{0}:,{1:0.000},{2:0.0000}", modelName, results.Accuracy, results.NegativeLogProb);
        }
        return results;
    }

    /// <summary>
    /// Returns the model name as a string.
    /// </summary>
    /// <param name="dataset">The name of the data set.</param>
    /// <param name="runType">The model run type.</param>
    /// <param name="taskSelectionMethod">The method for selecting tasks (Random / Entropy).</param>
    /// <param name="workerSelectionMethod">The method for selecting workers (only Random is implemented).</param>
    /// <param name="numCommunities">The number of communities (only for CBCC).</param>
    /// <returns>The model name.</returns>
    public static string GetModelName(string dataset, RunType runType, TaskSelectionMethod taskSelectionMethod, WorkerSelectionMethod workerSelectionMethod, int numCommunities = -1)
    {
        // NOTE(review): workerSelectionMethod and numCommunities do not appear in the name,
        // so runs differing only in those settings produce identical model names - confirm.
        return dataset + "_" + Enum.GetName(typeof(RunType), runType)
            + "_" + Enum.GetName(typeof(TaskSelectionMethod), taskSelectionMethod);
    }
}
/// <summary>
/// Options for which model to run.
/// </summary>
public enum RunType
{
    /// <summary>
    /// The true label distribution
    /// as given by the normalised workers' label counts.
    /// </summary>
    VoteDistribution = 0,

    /// <summary>
    /// The true label is the majority label.
    /// </summary>
    MajorityVote = 1,

    /// <summary>
    /// The Dawid-Skene model.
    /// </summary>
    DawidSkene = 2,

    /// <summary>
    /// The BCC (Bayesian classifier combination) model.
    /// </summary>
    BCC = 3,

    /// <summary>
    /// The CBCC (community-based BCC) model.
    /// </summary>
    CBCC = 4,
}
/// <summary>
/// This class represents a single datum (one worker's label for one task),
/// and has methods to read in data.
/// </summary>
public class Datum
{
    /// <summary>
    /// The worker id.
    /// </summary>
    public string WorkerId;

    /// <summary>
    /// The task id.
    /// </summary>
    public string TaskId;

    /// <summary>
    /// The worker's label.
    /// </summary>
    public int WorkerLabel;

    /// <summary>
    /// The task's gold label (null when the line carries no gold label).
    /// </summary>
    public int? GoldLabel;

    /// <summary>
    /// Loads the data file in the format (worker id, task id, worker label, ?gold label).
    /// Blank lines (e.g. a trailing newline at the end of the file) are skipped.
    /// </summary>
    /// <param name="filename">The data file.</param>
    /// <returns>The list of parsed data.</returns>
    public static IList<Datum> LoadData(string filename)
    {
        var result = new List<Datum>();
        using (var reader = new StreamReader(filename))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                // Skip blank lines; the field indexing below would otherwise
                // throw an IndexOutOfRangeException on a trailing newline.
                if (string.IsNullOrWhiteSpace(line))
                    continue;
                var strarr = line.Split(',');
                var datum = new Datum()
                {
                    WorkerId = strarr[0],
                    TaskId = strarr[1],
                    WorkerLabel = int.Parse(strarr[2]),
                };
                // The gold label is optional; GoldLabel stays null when it is absent.
                if (strarr.Length == 4)
                {
                    datum.GoldLabel = int.Parse(strarr[3]);
                }
                result.Add(datum);
            }
        }
        return result;
    }
}
}

Просмотреть файл

@ -0,0 +1,77 @@
<Project Sdk="Microsoft.NET.Sdk">
  <!-- Crowdsourcing example (Venanzi et al., WWW'14): console app built against the
       in-repo Infer.NET Compiler and Runtime projects. -->
  <PropertyGroup>
    <GenerateAssemblyInfo>false</GenerateAssemblyInfo>
    <OutputType>Exe</OutputType>
    <WarningLevel>4</WarningLevel>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <WarningsAsErrors />
    <ErrorReport>prompt</ErrorReport>
    <Prefer32Bit>false</Prefer32Bit>
    <DefineConstants>TRACE</DefineConstants>
    <Configurations>Debug;Release;DebugFull;DebugCore;ReleaseFull;ReleaseCore</Configurations>
  </PropertyGroup>
  <!-- *Full configurations build for .NET Framework only, *Core configurations for
       .NET Core only; plain Debug/Release build both target frameworks. -->
  <Choose>
    <When Condition="'$(Configuration)'=='DebugFull' OR '$(Configuration)'=='ReleaseFull'">
      <PropertyGroup>
        <TargetFramework>net461</TargetFramework>
      </PropertyGroup>
    </When>
    <When Condition="'$(Configuration)'=='DebugCore' OR '$(Configuration)'=='ReleaseCore'">
      <PropertyGroup>
        <TargetFramework>netcoreapp2.1</TargetFramework>
      </PropertyGroup>
    </When>
    <Otherwise>
      <PropertyGroup>
        <TargetFrameworks>netcoreapp2.1;net461</TargetFrameworks>
      </PropertyGroup>
    </Otherwise>
  </Choose>
  <!-- Per-framework preprocessor symbols. -->
  <PropertyGroup Condition=" '$(TargetFramework)' == 'netcoreapp2.1'">
    <DefineConstants>$(DefineConstants);NETCORE;NETSTANDARD;NETSTANDARD2_0</DefineConstants>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(TargetFramework)' == 'net461'">
    <DefineConstants>$(DefineConstants);NETFULL</DefineConstants>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU' OR '$(Configuration)|$(Platform)'=='DebugFull|AnyCPU' OR '$(Configuration)|$(Platform)'=='DebugCore|AnyCPU'">
    <DebugSymbols>true</DebugSymbols>
    <DebugType>full</DebugType>
    <Optimize>false</Optimize>
    <DefineConstants>$(DefineConstants);DEBUG</DefineConstants>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|AnyCPU' OR '$(Configuration)|$(Platform)'=='ReleaseFull|AnyCPU' OR '$(Configuration)|$(Platform)'=='ReleaseCore|AnyCPU'">
    <DebugType>pdbonly</DebugType>
    <Optimize>true</Optimize>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='DebugFull|AnyCPU'">
    <PlatformTarget>AnyCPU</PlatformTarget>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ReleaseFull|AnyCPU'">
    <PlatformTarget>AnyCPU</PlatformTarget>
  </PropertyGroup>
  <ItemGroup>
    <ProjectReference Include="..\..\Compiler\Compiler.csproj" />
    <ProjectReference Include="..\..\Runtime\Runtime.csproj" />
  </ItemGroup>
  <ItemGroup>
    <Compile Include="..\..\Shared\SharedAssemblyFileVersion.cs" />
    <Compile Include="..\..\Shared\SharedAssemblyInfo.cs" />
  </ItemGroup>
  <ItemGroup>
    <!-- Copy the example data set next to the built binaries so relative Data\ paths resolve. -->
    <None Update="Data\CF.csv">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
  </ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,16 @@
842,79185673,0,0
1258,79185673,0,0
1467,79185673,0,0
1674,79185673,0,0
662,79185673,0,0
708,79185673,0,0
1507,79185673,3,0
1701,79185724,4
38,79185724,3
703,79185724,1
353,79185724,1
165,79185724,0
1025,79185724,4
1638,79185724,4
782,79185900,1
1480,79185900,1
1 842,79185673,0,0
2 1258,79185673,0,0
3 1467,79185673,0,0
4 1674,79185673,0,0
5 662,79185673,0,0
6 708,79185673,0,0
7 1507,79185673,3,0
8 1701,79185724,4
9 38,79185724,3
10 703,79185724,1
11 353,79185724,1
12 165,79185724,0
13 1025,79185724,4
14 1638,79185724,4
15 782,79185900,1
16 1480,79185900,1

Просмотреть файл

@ -0,0 +1,246 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Probabilistic.Distributions;
using Microsoft.ML.Probabilistic.Math;
using Microsoft.ML.Probabilistic.Utilities;
namespace Crowdsourcing
{
/// <summary>
/// Data mapping class. This class automatically computes the label range and
/// manages the mapping between the raw data (task, worker ids, and labels) and
/// the model-formatted data (which is in term of indices).
/// </summary>
public class DataMapping
{
    /// <summary>
    /// The mapping from the worker index to the worker id.
    /// </summary>
    public string[] WorkerIndexToId;

    /// <summary>
    /// The mapping from the worker id to the worker index.
    /// </summary>
    public Dictionary<string, int> WorkerIdToIndex;

    /// <summary>
    /// The mapping from the community id to the community index.
    /// </summary>
    public Dictionary<string, int> CommunityIdToIndex;

    /// <summary>
    /// The mapping from the community index to the community id.
    /// </summary>
    public string[] CommunityIndexToId;

    /// <summary>
    /// The mapping from the task index to the task id.
    /// </summary>
    public string[] TaskIndexToId;

    /// <summary>
    /// The mapping from the task id to the task index.
    /// </summary>
    public Dictionary<string, int> TaskIdToIndex;

    /// <summary>
    /// The lower bound of the labels range.
    /// </summary>
    public int LabelMin;

    /// <summary>
    /// The upper bound of the labels range.
    /// </summary>
    public int LabelMax;

    /// <summary>
    /// The enumerable list of data.
    /// </summary>
    public IEnumerable<Datum> Data
    {
        get;
        private set;
    }

    /// <summary>
    /// The number of label values.
    /// </summary>
    public int LabelCount
    {
        get
        {
            return LabelMax - LabelMin + 1;
        }
    }

    /// <summary>
    /// The number of workers.
    /// </summary>
    public int WorkerCount
    {
        get
        {
            return WorkerIndexToId.Length;
        }
    }

    /// <summary>
    /// The number of tasks.
    /// </summary>
    public int TaskCount
    {
        get
        {
            return TaskIndexToId.Length;
        }
    }

    /// <summary>
    /// Creates a data mapping.
    /// </summary>
    /// <param name="data">The data.</param>
    /// <param name="numCommunities">The number of communities.</param>
    /// <param name="labelMin">The lower bound of the labels range.</param>
    /// <param name="labelMax">The upper bound of the labels range.</param>
    public DataMapping(IEnumerable<Datum> data, int numCommunities = -1, int labelMin = int.MaxValue, int labelMax = int.MinValue)
    {
        // Materialize the source once: the queries below enumerate it several
        // times, which would repeat any deferred work (or re-read an underlying
        // stream) if the caller passed a lazily-evaluated sequence.
        IList<Datum> dataList = data as IList<Datum> ?? data.ToList();
        WorkerIndexToId = dataList.Select(d => d.WorkerId).Distinct().ToArray();
        WorkerIdToIndex = WorkerIndexToId.Select((id, idx) => new KeyValuePair<string, int>(id, idx)).ToDictionary(x => x.Key, y => y.Value);
        TaskIndexToId = dataList.Select(d => d.TaskId).Distinct().ToArray();
        TaskIdToIndex = TaskIndexToId.Select((id, idx) => new KeyValuePair<string, int>(id, idx)).ToDictionary(x => x.Key, y => y.Value);
        var labels = dataList.Select(d => d.WorkerLabel).Distinct().OrderBy(lab => lab).ToArray();
        if (labelMin <= labelMax)
        {
            // The caller fixed the label range explicitly.
            LabelMin = labelMin;
            LabelMax = labelMax;
        }
        else
        {
            // Infer the label range from the observed worker labels.
            LabelMin = labels.Min();
            LabelMax = labels.Max();
        }

        Data = dataList;
        if (numCommunities > 0)
        {
            CommunityIndexToId = Util.ArrayInit(numCommunities, comm => "Community" + comm);
            CommunityIdToIndex = CommunityIndexToId.Select((id, idx) => new KeyValuePair<string, int>(id, idx)).ToDictionary(x => x.Key, y => y.Value);
        }
    }

    /// <summary>
    /// Returns the matrix of the task indices (columns) of each worker (rows).
    /// </summary>
    /// <param name="data">The data.</param>
    /// <returns>The matrix of the task indices (columns) of each worker (rows).</returns>
    public int[][] GetTaskIndicesPerWorkerIndex(IEnumerable<Datum> data)
    {
        int[][] result = new int[WorkerCount][];
        for (int i = 0; i < WorkerCount; i++)
        {
            var wid = WorkerIndexToId[i];
            result[i] = data.Where(d => d.WorkerId == wid).Select(d => TaskIdToIndex[d.TaskId]).ToArray();
        }

        return result;
    }

    /// <summary>
    /// Returns the matrix of the labels (columns) of each worker (rows).
    /// </summary>
    /// <param name="data">The data.</param>
    /// <returns>The matrix of the labels (columns) of each worker (rows).</returns>
    public int[][] GetLabelsPerWorkerIndex(IEnumerable<Datum> data)
    {
        int[][] result = new int[WorkerCount][];
        for (int i = 0; i < WorkerCount; i++)
        {
            var wid = WorkerIndexToId[i];
            // Labels are shifted so they are zero-based indices into the model's label range.
            result[i] = data.Where(d => d.WorkerId == wid).Select(d => d.WorkerLabel - LabelMin).ToArray();
        }

        return result;
    }

    /// <summary>
    /// Returns the gold labels of each task.
    /// </summary>
    /// <returns>The dictionary keyed by task id and the value is the gold label.</returns>
    public Dictionary<string, int?> GetGoldLabelsPerTaskId()
    {
        // Gold labels that are not consistent are returned as null
        // Labels are returned as indexed by task index
        return Data.GroupBy(d => d.TaskId).
            Select(t => t.GroupBy(d => d.GoldLabel).Where(d => d.Key != null)).
            Where(gold_d => gold_d.Count() > 0).
            Select(gold_d =>
            {
                int count = gold_d.Distinct().Count();
                var datum = gold_d.First().First();
                if (count == 1)
                {
                    var gold = datum.GoldLabel;
                    if (gold != null)
                        gold = gold.Value - LabelMin;
                    return new Tuple<string, int?>(datum.TaskId, gold);
                }
                else
                {
                    // Conflicting gold labels for the same task.
                    return new Tuple<string, int?>(datum.TaskId, (int?)null);
                }
            }).ToDictionary(tup => tup.Item1, tup => tup.Item2);
    }

    /// <summary>
    /// For each task, gets the majority vote label if it is unique.
    /// </summary>
    /// <returns>The list of majority vote labels.</returns>
    public int?[] GetMajorityVotesPerTaskIndex()
    {
        return Data.GroupBy(d => TaskIdToIndex[d.TaskId]).
            OrderBy(g => g.Key).
            Select(t => t.GroupBy(d => d.WorkerLabel - LabelMin).
            Select(g => new { label = g.Key, count = g.Count() })).
            Select(arr =>
            {
                int max = arr.Max(a => a.count);
                int[] majorityLabs = arr.Where(a => a.count == max).Select(a => a.label).ToArray();
                if (majorityLabs.Length == 1)
                    return (int?)majorityLabs[0];
                else
                {
                    // Tied vote: there is no unique majority label.
                    return null;
                }
            }).ToArray();
    }

    /// <summary>
    /// For each task, gets the empirical label distribution.
    /// </summary>
    /// <returns>The array of empirical label distributions, indexed by task index.</returns>
    public Discrete[] GetVoteDistribPerTaskIndex()
    {
        return Data.GroupBy(d => TaskIdToIndex[d.TaskId]).
            OrderBy(g => g.Key).
            Select(t => t.GroupBy(d => d.WorkerLabel - LabelMin).
            Select(g => new
            {
                label = g.Key,
                count = g.Count()
            })).
            Select(arr =>
            {
                Vector v = Vector.Zero(LabelCount);
                foreach (var a in arr)
                    v[a.label] = (double)a.count;
                return new Discrete(v);
            }).ToArray();
    }
}
}

Просмотреть файл

@ -0,0 +1,817 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.ML.Probabilistic.Distributions;
using Microsoft.ML.Probabilistic.Math;
using Microsoft.ML.Probabilistic.Utilities;
namespace Crowdsourcing
{
/// <summary>
/// Results class containing posteriors and predictions.
/// </summary>
public class Results
{
    /// <summary>
    /// The posterior of the true label for each task.
    /// </summary>
    public Dictionary<string, Discrete> TrueLabel
    {
        get;
        private set;
    }

    /// <summary>
    /// The predicted label for each task when doing simulations from the current
    /// model state. It avoids overwriting the true label posterior.
    /// </summary>
    public Dictionary<string, Discrete> LookAheadTrueLabel
    {
        get;
        private set;
    }

    /// <summary>
    /// The posterior for the constraint that allows online learning for the true label variable.
    /// </summary>
    public Dictionary<string, Discrete> TrueLabelConstraint
    {
        get;
        private set;
    }

    /// <summary>
    /// The probabilities that generate the true label of all the tasks.
    /// </summary>
    public Dirichlet BackgroundLabelProb
    {
        get;
        private set;
    }

    /// <summary>
    /// The posterior of the confusion matrix of each worker.
    /// </summary>
    public Dictionary<string, Dirichlet[]> WorkerConfusionMatrix
    {
        get;
        private set;
    }

    /// <summary>
    /// The look-ahead posterior of the confusion matrix of each worker obtained after simulating
    /// a new label in look-ahead run mode.
    /// </summary>
    public Dictionary<string, Dirichlet[]> LookAheadWorkerConfusionMatrix
    {
        get;
        private set;
    }

    /// <summary>
    /// The predictive probabilities of the labels produced by each worker.
    /// </summary>
    public Dictionary<string, Dictionary<string, Discrete>> WorkerPrediction
    {
        get;
        private set;
    }

    /// <summary>
    /// The community membership probabilities of each worker.
    /// </summary>
    public Dictionary<string, Discrete> WorkerCommunity
    {
        get;
        private set;
    }

    /// <summary>
    /// The confusion matrix of each community.
    /// </summary>
    public Dirichlet[][] CommunityConfusionMatrix
    {
        get;
        private set;
    }

    /// <summary>
    /// The score matrix of each community.
    /// </summary>
    public VectorGaussian[][] CommunityScoreMatrix
    {
        get;
        private set;
    }

    /// <summary>
    /// The posterior for the constraint that allows online learning for worker confusion matrices
    /// in the community model.
    /// </summary>
    public Dictionary<string, VectorGaussian[]> WorkerScoreMatrixConstraint
    {
        get;
        private set;
    }

    /// <summary>
    /// The probabilities that generate the community memberships of all the workers.
    /// </summary>
    public Dirichlet CommunityProb
    {
        get;
        private set;
    }

    /// <summary>
    /// The posterior for the constraint that allows online learning for community membership
    /// in the community model.
    /// </summary>
    public Dictionary<string, Discrete> CommunityConstraint
    {
        get;
        private set;
    }

    /// <summary>
    /// Model evidence.
    /// </summary>
    public Bernoulli ModelEvidence
    {
        get;
        private set;
    }

    /// <summary>
    /// The data mapping.
    /// </summary>
    public DataMapping Mapping
    {
        get;
        private set;
    }

    /// <summary>
    /// The gold labels of each task. The gold label type is nullable to
    /// support the (usual) situation where there are no gold labels.
    /// </summary>
    public Dictionary<string, int?> GoldLabels
    {
        get;
        private set;
    }

    /// <summary>
    /// The accuracy of the current true label predictions.
    /// </summary>
    public double Accuracy
    {
        get;
        private set;
    }

    /// <summary>
    /// The negative log probability density (NLPD) scores of the current true label predictions.
    /// </summary>
    public double NegativeLogProb
    {
        get;
        private set;
    }

    /// <summary>
    /// The average recall of the current true label predictions.
    /// </summary>
    public double AvgRecall
    {
        get;
        private set;
    }

    /// <summary>
    /// The confusion matrix of the predicted true labels against the gold labels.
    /// The rows are the gold labels and the columns are the predicted labels.
    /// </summary>
    public double[,] ModelConfusionMatrix
    {
        get;
        private set;
    }

    /// <summary>
    /// Flags whether the model instance is CBCC (true) or BCC (false).
    /// </summary>
    public bool IsCommunityModel
    {
        get;
        private set;
    }

    /// <summary>
    /// The number of communities.
    /// </summary>
    public int CommunityCount
    {
        get;
        private set;
    }

    /// <summary>
    /// Runs the majority vote method on the data.
    /// </summary>
    /// <param name="data">The data</param>
    /// <param name="calculateAccuracy">Compute the accuracy (true).</param>
    /// <param name="useVoteDistribution">The true label is sampled from the vote distribution (true) or it is
    /// taken as the mode of the vote counts (false).
    /// In the latter case, ties are broken by sampling from the most voted classes.</param>
    /// <returns>The updated results</returns>
    public Results RunMajorityVote(IList<Datum> data, bool calculateAccuracy, bool useVoteDistribution)
    {
        var dataMapping = new DataMapping(data);
        Mapping = dataMapping;
        GoldLabels = Mapping.GetGoldLabelsPerTaskId();
        // Tasks with no unique majority label get a null inferred label.
        var inferredLabels = useVoteDistribution ? dataMapping.GetVoteDistribPerTaskIndex() : dataMapping.GetMajorityVotesPerTaskIndex().Select(mv => mv == null ? (Discrete)null : Discrete.PointMass(mv.Value, dataMapping.LabelCount)).ToArray();
        TrueLabel = inferredLabels.Select((lab, i) => new
        {
            key = dataMapping.TaskIndexToId[i],
            val = lab
        }).ToDictionary(a => a.key, a => a.val);

        if (calculateAccuracy)
        {
            UpdateAccuracy();
        }

        return this;
    }

    /// <summary>
    /// Run Dawid-Skene on the data.
    /// </summary>
    /// <param name="data">The data.</param>
    /// <param name="calculateAccuracy">Whether to calculate accuracy</param>
    /// <returns>A results instance</returns>
    public Results RunDawidSkene(IList<Datum> data, bool calculateAccuracy)
    {
        // If you want to run Dawid-Skene code, download his code, integrate it into
        // the project, and change false to true below.
#if false
        var dataMapping = new DataMapping(data);
        Mapping = dataMapping;
        var labelings = data.Select(d => new Labeling(d.WorkerId, d.TaskId, d.WorkerLabel.ToString(), d.GoldLabel.ToString())).ToList();
        DawidSkene ds = new DawidSkene(labelings, null, null);
        // The labels may be in a different order from our data labeling - we need to create a map.
        int[] labelIndexMap = new int[dataMapping.LabelCount];
        var dwLabels = ds.classes.Keys.ToArray();
        for (int i = 0; i < dataMapping.LabelCount; i++)
        {
            labelIndexMap[i] = Array.IndexOf(dwLabels, (i + dataMapping.LabelMin).ToString());
        }

        GoldLabels = Mapping.GetGoldLabelsPerTaskId().
            ToDictionary(kvp => kvp.Key, kvp => kvp.Value == null ? (int?)null : (int?)labelIndexMap[kvp.Value.Value]);

        ds.Estimate(10);

        var inferredLabels = ds.GetObjectClassProbabilities().Select(r => new Discrete(r)).ToArray();
        TrueLabel = inferredLabels.Select((lab, i) => new
        {
            key = dataMapping.TaskIndexToId[i],
            val = lab
        }).ToDictionary(a => a.key, a => a.val);

        if (calculateAccuracy)
        {
            UpdateAccuracy();
        }

        return this;
#else
        throw new ApplicationException("To support Dawid-Skene, you must link to the C# version of their code");
#endif
    }

    /// <summary>
    /// The different modes in which the model can be run.
    /// </summary>
    public enum RunMode
    {
        /// <summary>
        /// Clears all posteriors
        /// </summary>
        ClearResults,

        /// <summary>
        /// Training from a batch of data - uses initial priors.
        /// </summary>
        BatchTraining,

        /// <summary>
        /// Online training from a batch of data - uses previous posteriors as priors.
        /// </summary>
        OnlineTraining,

        /// <summary>
        /// Online training where we don't update the posteriors
        /// </summary>
        LookAheadExperiment,

        /// <summary>
        /// Use communities as workers in a BCC
        /// </summary>
        LoadAndUseCommunityPriors,

        /// <summary>
        /// Prediction of worker labels
        /// </summary>
        Prediction,
    };

    /// <summary>
    /// The structure with the model parameters.
    /// </summary>
    [Serializable]
    public struct NonTaskWorkerParameters
    {
        public Dirichlet BackgroundLabelProb;
        public Dirichlet CommunityProb;
        public VectorGaussian[][] CommunityScoreMatrix;
    }

    /// <summary>
    /// Runs the BCC or CBCC model.
    /// </summary>
    /// <param name="modelName">The model name.</param>
    /// <param name="data">The data that will be used for this run.</param>
    /// <param name="fullData">The full data set of data.</param>
    /// <param name="model">The model instance (BCC or CBCC).</param>
    /// <param name="mode">The mode (for example training, prediction, etc.).</param>
    /// <param name="calculateAccuracy">Whether to calculate accuracy.</param>
    /// <param name="numCommunities">The number of communities (community model only).</param>
    /// <param name="serialize">Whether to serialize all posteriors.</param>
    /// <param name="serializeCommunityPosteriors">Whether to serialize community posteriors.</param>
    public void RunBCC(string modelName, IList<Datum> data, IList<Datum> fullData, BCC model, RunMode mode, bool calculateAccuracy, int numCommunities = -1, bool serialize = false, bool serializeCommunityPosteriors = false)
    {
        CommunityModel communityModel = model as CommunityModel;
        IsCommunityModel = communityModel != null;

        if (this.Mapping == null)
        {
            // The mapping is built from the full data set so that indices remain
            // stable across successive (e.g. online) runs over subsets of the data.
            this.Mapping = new DataMapping(fullData, numCommunities);
            this.GoldLabels = this.Mapping.GetGoldLabelsPerTaskId();
        }

        // A new model is created if the label count or the task count has changed
        bool createModel = (Mapping.LabelCount != model.LabelCount) || (Mapping.TaskCount != model.TaskCount);

        if (IsCommunityModel)
        {
            // Creates a new CBCC model instance
            CommunityCount = numCommunities;
            createModel = createModel || (numCommunities != communityModel.CommunityCount);
            if (createModel)
            {
                communityModel.CreateModel(Mapping.TaskCount, Mapping.LabelCount, numCommunities);
            }
        }
        else if (createModel)
        {
            // Creates a new BCC model instance
            model.CreateModel(Mapping.TaskCount, Mapping.LabelCount);
        }

        // Selects the prior according to the run mode
        BCC.Posteriors priors = null;
        switch (mode)
        {
            // Use existing priors
            case RunMode.OnlineTraining:
            case RunMode.LookAheadExperiment:
            case RunMode.Prediction:
                priors = ToPriors();
                break;
            default:

                // Use default priors
                ClearResults();
                if (mode == RunMode.LoadAndUseCommunityPriors && IsCommunityModel)
                {
                    priors = DeserializeCommunityPosteriors(modelName, numCommunities);
                }
                break;
        }

        // Get data to observe
        var labelsPerWorkerIndex = Mapping.GetLabelsPerWorkerIndex(data);
        if (mode == RunMode.Prediction)
        {
            // Signal prediction mode by setting all labels to null
            labelsPerWorkerIndex = labelsPerWorkerIndex.Select(arr => (int[])null).ToArray();
        }

        // Run model inference
        BCC.Posteriors posteriors = model.Infer(
            Mapping.GetTaskIndicesPerWorkerIndex(data),
            labelsPerWorkerIndex, priors);
        UpdateResults(posteriors, mode);

        // Compute accuracy
        if (calculateAccuracy)
        {
            UpdateAccuracy();
        }

        // Serialize parameters
        if (serialize)
        {
            using (FileStream stream = new FileStream(modelName + ".xml", FileMode.Create))
            {
                var serializer = new System.Xml.Serialization.XmlSerializer(IsCommunityModel ? typeof(CommunityModel.Posteriors) : typeof(BCC.Posteriors));
                serializer.Serialize(stream, posteriors);
            }
        }

        if (serializeCommunityPosteriors && IsCommunityModel)
        {
            SerializeCommunityPosteriors(modelName);
        }
    }

    /// <summary>
    /// Serializes the posteriors on an xml file.
    /// </summary>
    /// <param name="modelName">The model name.</param>
    void SerializeCommunityPosteriors(string modelName)
    {
        NonTaskWorkerParameters ntwp = new NonTaskWorkerParameters();
        ntwp.BackgroundLabelProb = BackgroundLabelProb;
        ntwp.CommunityProb = CommunityProb;
        ntwp.CommunityScoreMatrix = CommunityScoreMatrix;
        using (FileStream stream = new FileStream(modelName + "CommunityPriors.xml", FileMode.Create))
        {
            var serializer = new System.Xml.Serialization.XmlSerializer(typeof(NonTaskWorkerParameters));
            serializer.Serialize(stream, ntwp);
        }
    }

    /// <summary>
    /// Deserializes the parameters of CBCC from an xml file (used in the LoadAndUseCommunityPriors mode).
    /// </summary>
    /// <param name="modelName">The model name.</param>
    /// <param name="numCommunities">The number of communities.</param>
    /// <returns>The deserialized CBCC posteriors, validated against the current mapping.</returns>
    CommunityModel.Posteriors DeserializeCommunityPosteriors(string modelName, int numCommunities)
    {
        CommunityModel.Posteriors cbccPriors = new CommunityModel.Posteriors();
        using (FileStream stream = new FileStream(modelName + "CommunityPriors.xml", FileMode.Open))
        {
            var serializer = new System.Xml.Serialization.XmlSerializer(typeof(NonTaskWorkerParameters));
            var ntwp = (NonTaskWorkerParameters)serializer.Deserialize(stream);

            if (ntwp.BackgroundLabelProb.Dimension != Mapping.LabelCount)
            {
                throw new ApplicationException("Unexpected number of labels");
            }

            BackgroundLabelProb = ntwp.BackgroundLabelProb;
            cbccPriors.BackgroundLabelProb = ntwp.BackgroundLabelProb;

            if (ntwp.CommunityScoreMatrix.Length != numCommunities)
            {
                throw new ApplicationException("Unexpected number of communities");
            }

            if (ntwp.CommunityScoreMatrix[0][0].Dimension != Mapping.LabelCount)
            {
                throw new ApplicationException("Unexpected number of labels");
            }

            CommunityScoreMatrix = ntwp.CommunityScoreMatrix;
            cbccPriors.CommunityScoreMatrix = ntwp.CommunityScoreMatrix;

            if (ntwp.CommunityProb.Dimension != numCommunities)
            {
                throw new ApplicationException("Unexpected number of communities");
            }

            CommunityProb = ntwp.CommunityProb;
            cbccPriors.CommunityProb = ntwp.CommunityProb;
        }

        return cbccPriors;
    }

    /// <summary>
    /// Resets all the parameters to the default values.
    /// </summary>
    void ClearResults()
    {
        BackgroundLabelProb = Dirichlet.Uniform(Mapping.LabelCount);
        WorkerConfusionMatrix = new Dictionary<string, Dirichlet[]>();
        WorkerPrediction = new Dictionary<string, Dictionary<String, Discrete>>();
        WorkerCommunity = new Dictionary<string, Discrete>();
        TrueLabel = new Dictionary<string, Discrete>();
        TrueLabelConstraint = new Dictionary<string, Discrete>();
        CommunityConfusionMatrix = null;
        WorkerScoreMatrixConstraint = new Dictionary<string, VectorGaussian[]>();
        CommunityProb = null;
        CommunityScoreMatrix = null;
        CommunityConstraint = new Dictionary<string, Discrete>();
        LookAheadTrueLabel = new Dictionary<string, Discrete>();
        LookAheadWorkerConfusionMatrix = new Dictionary<string, Dirichlet[]>();
        ModelEvidence = new Bernoulli(0.5);
    }

    /// <summary>
    /// Updates the results with the new posteriors.
    /// </summary>
    /// <param name="posteriors">The posteriors.</param>
    /// <param name="mode">The mode (for example training, prediction, etc.).</param>
    void UpdateResults(BCC.Posteriors posteriors, RunMode mode)
    {
        // In the lookAheadExperiment mode, update only the LookAhead results
        if (mode == RunMode.LookAheadExperiment)
        {
            for (int t = 0; t < posteriors.TrueLabel.Length; t++)
            {
                LookAheadTrueLabel[Mapping.TaskIndexToId[t]] = posteriors.TrueLabel[t];
            }

            for (int w = 0; w < posteriors.WorkerConfusionMatrix.Length; w++)
            {
                LookAheadWorkerConfusionMatrix[Mapping.WorkerIndexToId[w]] = posteriors.WorkerConfusionMatrix[w];
            }
        }

        // In the prediction mode, update only the worker prediction results
        else if (mode == RunMode.Prediction)
        {
            for (int w = 0; w < posteriors.WorkerConfusionMatrix.Length; w++)
            {
                WorkerPrediction[Mapping.WorkerIndexToId[w]] = new Dictionary<string, Discrete>();
                for (int tw = 0; tw < posteriors.WorkerPrediction[w].Length; tw++)
                {
                    WorkerPrediction[Mapping.WorkerIndexToId[w]][Mapping.TaskIndexToId[tw]] = posteriors.WorkerPrediction[w][tw];
                }
            }
        }
        else
        {
            // In all the other modes, update all the results
            CommunityModel.Posteriors communityPosteriors = posteriors as CommunityModel.Posteriors;
            bool isCommunityModel = communityPosteriors != null;
            BackgroundLabelProb = posteriors.BackgroundLabelProb;
            for (int w = 0; w < posteriors.WorkerConfusionMatrix.Length; w++)
            {
                WorkerConfusionMatrix[Mapping.WorkerIndexToId[w]] = posteriors.WorkerConfusionMatrix[w];
            }

            for (int t = 0; t < posteriors.TrueLabel.Length; t++)
            {
                TrueLabel[Mapping.TaskIndexToId[t]] = posteriors.TrueLabel[t];
            }

            for (int t = 0; t < posteriors.TrueLabelConstraint.Length; t++)
            {
                TrueLabelConstraint[Mapping.TaskIndexToId[t]] = posteriors.TrueLabelConstraint[t];
            }

            if (isCommunityModel)
            {
                CommunityConfusionMatrix = communityPosteriors.CommunityConfusionMatrix;
                for (int w = 0; w < communityPosteriors.WorkerScoreMatrixConstraint.Length; w++)
                {
                    WorkerScoreMatrixConstraint[Mapping.WorkerIndexToId[w]] = communityPosteriors.WorkerScoreMatrixConstraint[w];
                    CommunityConstraint[Mapping.WorkerIndexToId[w]] = communityPosteriors.WorkerCommunityConstraint[w];
                    WorkerCommunity[Mapping.WorkerIndexToId[w]] = communityPosteriors.Community[w];
                }

                CommunityProb = communityPosteriors.CommunityProb;
                CommunityScoreMatrix = communityPosteriors.CommunityScoreMatrix;
            }

            this.ModelEvidence = posteriors.Evidence;
        }
    }

    /// <summary>
    /// Loads the priors of BCC and CBCC.
    /// </summary>
    /// <returns>A BCC posterior instance with the loaded priors.</returns>
    BCC.Posteriors ToPriors()
    {
        int numClasses = Mapping.LabelCount;
        int numTasks = Mapping.TaskCount;
        int numWorkers = Mapping.WorkerCount;
        CommunityModel.Posteriors cbccPriors = new CommunityModel.Posteriors();
        BCC.Posteriors priors = IsCommunityModel ? cbccPriors : new BCC.Posteriors();

        // Loads the prior of the background probabilities of the tasks
        priors.BackgroundLabelProb = BackgroundLabelProb;

        // Loads the prior of the confusion matrix of each worker
        priors.WorkerConfusionMatrix = Util.ArrayInit(numWorkers,
            w =>
            {
                string wid = Mapping.WorkerIndexToId[w];
                if (WorkerConfusionMatrix.ContainsKey(wid))
                    return Util.ArrayInit(numClasses, lab => WorkerConfusionMatrix[wid][lab]);
                else
                    // Worker not seen before: use a uniform prior.
                    return Util.ArrayInit(numClasses, lab => Dirichlet.Uniform(numClasses));
            });

        // Loads the true label constraint of each task
        priors.TrueLabelConstraint = Util.ArrayInit(numTasks,
            t =>
            {
                string tid = Mapping.TaskIndexToId[t];
                if (TrueLabelConstraint.ContainsKey(tid))
                    return TrueLabelConstraint[Mapping.TaskIndexToId[t]];
                else
                    // Task not seen before: use a uniform constraint.
                    return Discrete.Uniform(numClasses);
            });

        // Loads the priors of the parameters of CBCC
        if (IsCommunityModel)
        {
            cbccPriors.CommunityConfusionMatrix = CommunityConfusionMatrix;
            cbccPriors.WorkerScoreMatrixConstraint = Util.ArrayInit(numWorkers,
                w =>
                {
                    string wid = Mapping.WorkerIndexToId[w];
                    if (WorkerScoreMatrixConstraint.ContainsKey(wid))
                        return Util.ArrayInit(numClasses, lab => WorkerScoreMatrixConstraint[wid][lab]);
                    else
                        return Util.ArrayInit(numClasses, lab => VectorGaussian.Uniform(numClasses));
                });
            cbccPriors.CommunityProb = CommunityProb;
            cbccPriors.CommunityScoreMatrix = CommunityScoreMatrix;
            cbccPriors.WorkerCommunityConstraint = Util.ArrayInit(numWorkers,
                w =>
                {
                    string wid = Mapping.WorkerIndexToId[w];
                    if (CommunityConstraint.ContainsKey(wid))
                        return CommunityConstraint[wid];
                    else
                        return Discrete.Uniform(CommunityCount);
                });
        }

        priors.Evidence = ModelEvidence;
        return priors;
    }

    /// <summary>
    /// Updates the accuracy using the current results.
    /// </summary>
    private void UpdateAccuracy()
    {
        // Cap the per-task NLPD contribution so a single confident mistake cannot dominate.
        double nlpdThreshold = -Math.Log(0.001);
        int labelCount = TrueLabel.Where(kvp => kvp.Value != null).First().Value.Dimension;
        var confusionMatrix = Util.ArrayInit(labelCount, labelCount, (i, j) => 0.0);
        int correct = 0;
        double logProb = 0.0;

        int goldX = 0;
        foreach (var kvp in GoldLabels)
        {
            if (kvp.Value == null)
                continue;

            // We have a gold label
            goldX++;
            var trueLabel = TrueLabel[kvp.Key];
            if (trueLabel == null)
                continue; // No inferred label

            var probs = trueLabel.GetProbs();
            var max = probs.Max();
            // Break ties between equally probable labels by sampling uniformly among them.
            var predictedLabels = probs.Select((p, i) => new
            {
                prob = p,
                idx = i
            }).Where(a => a.prob == max).Select(a => a.idx).ToArray();
            int predictedLabel = predictedLabels.Length == 1 ? predictedLabels[0] : predictedLabels[Rand.Int(predictedLabels.Length)];

            int goldLabel = kvp.Value.Value;
            confusionMatrix[goldLabel, predictedLabel] = confusionMatrix[goldLabel, predictedLabel] + 1.0;
            if (goldLabel == predictedLabel)
                correct++;

            var nlp = -trueLabel.GetLogProb(goldLabel);
            if (nlp > nlpdThreshold)
                nlp = nlpdThreshold;
            logProb += nlp;
        }

        // NOTE(review): if there are no gold labels (goldX == 0) these divisions
        // yield NaN/Infinity — confirm callers tolerate that.
        Accuracy = correct / (double)goldX;
        NegativeLogProb = logProb / (double)goldX;
        ModelConfusionMatrix = confusionMatrix;

        // Compute average recall
        double sumRec = 0;
        for (int i = 0; i < labelCount; i++)
        {
            double classSum = 0;
            for (int j = 0; j < labelCount; j++)
            {
                classSum += confusionMatrix[i, j];
            }

            // NOTE(review): classSum == 0 (gold class never observed) makes this NaN — confirm intended.
            sumRec += confusionMatrix[i, i] / classSum;
        }
        AvgRecall = sumRec / labelCount;
    }

    /// <summary>
    /// Writes out the mean of an uncertain confusion matrix to a StreamWriter.
    /// The confusion matrix is passed as an array of Dirichlets, one for each row
    /// of the confusion matrix (as given by the posteriors from the model).
    /// </summary>
    /// <param name="writer">A StreamWriter instance.</param>
    /// <param name="worker">The worker id.</param>
    /// <param name="confusionMatrix">The confusion matrix</param>
    private static void WriteConfusionMatrix(StreamWriter writer, string worker, Dirichlet[] confusionMatrix)
    {
        int labelCount = confusionMatrix.Length;
        var meanConfusionMatrix = confusionMatrix.Select(cm => cm.GetMean()).ToArray();
        var printableConfusionMatrix = Util.ArrayInit(labelCount, labelCount, (i, j) => meanConfusionMatrix[i][j]);
        WriteWorkerConfusionMatrix(writer, worker, printableConfusionMatrix);
    }

    /// <summary>
    /// Writes a confusion matrix to a stream writer.
    /// </summary>
    /// <param name="writer">A StreamWriter instance.</param>
    /// <param name="worker">The worker id.</param>
    /// <param name="confusionMatrix">The confusion matrix.</param>
    private static void WriteWorkerConfusionMatrix(StreamWriter writer, string worker, double[,] confusionMatrix)
    {
        int labelCount = confusionMatrix.GetLength(0);
        writer.WriteLine(worker);
        for (int j = 0; j < labelCount; j++)
            writer.Write(",{0}", j);
        writer.WriteLine();

        for (int i = 0; i < labelCount; i++)
        {
            writer.Write(i);
            for (int j = 0; j < labelCount; j++)
                writer.Write(",{0:0.0000}", confusionMatrix[i, j]);

            writer.WriteLine();
        }
    }

    /// <summary>
    /// Writes various results to a StreamWriter.
    /// </summary>
    /// <param name="writer">A StreamWriter instance.</param>
    /// <param name="writeCommunityParameters">Set true to write community parameters.</param>
    /// <param name="writeWorkerParameters">Set true to write worker parameters.</param>
    /// <param name="writeWorkerCommunities">Set true to write worker communities.</param>
    public void WriteResults(StreamWriter writer, bool writeCommunityParameters, bool writeWorkerParameters, bool writeWorkerCommunities)
    {
        this.WriteAccuracy(writer);

        if (writeCommunityParameters && this.CommunityConfusionMatrix != null)
        {
            for (int communityIndex = 0; communityIndex < this.CommunityConfusionMatrix.Length; communityIndex++)
            {
                WriteConfusionMatrix(writer, "Community" + communityIndex, this.CommunityConfusionMatrix[communityIndex]);
            }
        }

        if (writeWorkerParameters && this.WorkerConfusionMatrix != null)
        {
            foreach (var kvp in this.WorkerConfusionMatrix)
            {
                WriteConfusionMatrix(writer, kvp.Key, kvp.Value);
            }
        }

        if (writeWorkerCommunities && this.WorkerCommunity != null)
        {
            foreach (var kvp in this.WorkerCommunity)
            {
                writer.WriteLine(string.Format("{0}:\t{1}", kvp.Key, kvp.Value));
            }
        }

        writer.WriteLine("Log Evidence = {0:0.0000}", ModelEvidence.LogOdds);
    }

    /// <summary>
    /// Writes the accuracy results on the StreamWriter.
    /// </summary>
    /// <param name="writer">The StreamWriter.</param>
    public void WriteAccuracy(StreamWriter writer)
    {
        writer.WriteLine("Accuracy = {0:0.000}", this.Accuracy);
        writer.WriteLine("Mean negative log prob density = {0:0.000}", this.NegativeLogProb);
        WriteWorkerConfusionMatrix(writer, "Model confusion matrix", this.ModelConfusionMatrix);
    }
}
}

Просмотреть файл

@ -0,0 +1,113 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.ML.Probabilistic.Distributions;
using Microsoft.ML.Probabilistic.Math;
using Microsoft.ML.Probabilistic.Models;
using Microsoft.ML.Probabilistic.Utilities;
using System;
namespace CrowdsourcingWithWords
{
/// <summary>
/// The BCC model class.
/// </summary>
public class BCC
{
    /// <summary>
    /// The number of label values.
    /// </summary>
    public int LabelCount => c?.SizeAsInt ?? 0;

    /// <summary>
    /// The number of tasks.
    /// </summary>
    public int TaskCount => n?.SizeAsInt ?? 0;

    // Ranges
    protected Range n;  // over tasks (TaskCount reads n.SizeAsInt)
    protected Range k;  // presumably over workers — TODO confirm against the model-building code
    protected Range c;  // over label values (LabelCount reads c.SizeAsInt)
    protected Range kn; // presumably over the tasks labeled by each worker — TODO confirm

    // Variables in the model
    protected Variable<int> WorkerCount;
    protected VariableArray<int> TrueLabel;
    protected VariableArray<int> WorkerTaskCount;
    protected VariableArray<VariableArray<int>, int[][]> WorkerTaskIndex;
    protected VariableArray<VariableArray<int>, int[][]> WorkerLabel;
    protected Variable<Vector> BackgroundLabelProb;
    protected VariableArray<VariableArray<Vector>, Vector[][]> WorkerConfusionMatrix;
    protected Variable<bool> Evidence;

    // Prior distributions
    protected Variable<Dirichlet> BackgroundLabelProbPrior;
    protected VariableArray<VariableArray<Dirichlet>, Dirichlet[][]> ConfusionMatrixPrior;
    protected VariableArray<Discrete> TrueLabelConstraint;
    protected Variable<Bernoulli> EvidencePrior;

    // Inference engine
    protected InferenceEngine Engine;

    // Hyperparameters and inference settings

    // Prior belief that a worker assigns the correct label (the diagonal of the confusion matrix).
    public double InitialWorkerBelief
    {
        get;
        set;
    }

    /// <summary>
    /// Returns the confusion matrix prior of each worker.
    /// </summary>
    /// <returns>The confusion matrix prior of each worker.</returns>
    public Dirichlet[] GetConfusionMatrixPrior()
    {
        var confusionMatrixPrior = new Dirichlet[LabelCount];
        for (int d = 0; d < LabelCount; d++)
        {
            // Row d of the prior: off-diagonal pseudo-counts are 1, and the diagonal
            // pseudo-count b/(1-b)*(K-1) (b = InitialWorkerBelief, K = LabelCount) is
            // chosen so the prior mean of the diagonal entry equals InitialWorkerBelief.
            confusionMatrixPrior[d] = new Dirichlet(Util.ArrayInit(LabelCount, i => i == d ? (InitialWorkerBelief / (1 - InitialWorkerBelief)) * (LabelCount - 1) : 1.0));
        }

        return confusionMatrixPrior;
    }
}
/// <summary>
/// The BCC posteriors class.
/// </summary>
/// <remarks>
/// A plain data holder for the marginal distributions produced by inference;
/// fields are populated by the model's inference routine.
/// </remarks>
[Serializable]
public class BCCPosteriors
{
    /// <summary>
    /// The probabilities that generate the true labels of all the tasks.
    /// </summary>
    public Dirichlet BackgroundLabelProb;

    /// <summary>
    /// The probabilities of the true label of each task.
    /// </summary>
    public Discrete[] TrueLabel;

    /// <summary>
    /// The Dirichlet parameters of the confusion matrix of each worker.
    /// </summary>
    public Dirichlet[][] WorkerConfusionMatrix;

    /// <summary>
    /// The predictive probabilities of the worker's labels.
    /// </summary>
    public Discrete[][] WorkerPrediction;

    /// <summary>
    /// The true label constraint used in online training.
    /// </summary>
    public Discrete[] TrueLabelConstraint;

    /// <summary>
    /// The model evidence.
    /// </summary>
    public Bernoulli Evidence;
}
}

Просмотреть файл

@ -0,0 +1,181 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.ML.Probabilistic.Algorithms;
using Microsoft.ML.Probabilistic.Distributions;
using Microsoft.ML.Probabilistic.Math;
using Microsoft.ML.Probabilistic.Models;
using Microsoft.ML.Probabilistic.Utilities;
using System;
using System.Linq;
namespace CrowdsourcingWithWords
{
/// <summary>
/// The BCCWords model: extends the BCC crowd model with a per-class
/// word distribution over the document vocabulary.
/// </summary>
public class BCCWords : BCC
{
    // Add extra ranges
    private Range w;   // vocabulary words
    private Range nw;  // words within each task's document

    // Model evidence
    private Variable<bool> evidence;

    // Additional variables for BCCWords
    private VariableArray<Vector> ProbWord;
    private VariableArray<int> WordCount;
    private VariableArray<VariableArray<int>, int[][]> Words;
    private Variable<Dirichlet> ProbWordPrior;

    /// <summary>
    /// Defines the factor graph: true labels drawn from a background distribution,
    /// worker labels drawn from per-worker confusion matrices, and document words
    /// drawn from the word distribution of the task's true label.
    /// </summary>
    /// <param name="NumTasks">The number of tasks.</param>
    /// <param name="NumClasses">The number of label classes.</param>
    /// <param name="VocabSize">The vocabulary size.</param>
    /// <param name="numBatches">Not referenced in this method body — TODO confirm whether batching was intended.</param>
    public void CreateModel(int NumTasks, int NumClasses, int VocabSize, int numBatches = 3)
    {
        WorkerCount = Variable.New<int>().Named("WorkerCount");

        // Set up inference engine
        Engine = new InferenceEngine(new VariationalMessagePassing());

        // Set engine flags
        Engine.Compiler.WriteSourceFiles = true;
        Engine.Compiler.UseParallelForLoops = true;

        // The whole model sits inside an If(evidence) block so that
        // model evidence can be inferred as a Bernoulli over 'evidence'.
        evidence = Variable.Bernoulli(0.5).Named("evidence");
        IfBlock block = Variable.If(evidence);

        // Set up ranges
        n = new Range(NumTasks).Named("N");
        c = new Range(NumClasses).Named("C");
        k = new Range(WorkerCount).Named("K");
        WorkerTaskCount = Variable.Array<int>(k).Named("WorkerTaskCount");
        kn = new Range(WorkerTaskCount[k]).Named("KN");
        WorkerTaskIndex = Variable.Array(Variable.Array<int>(kn), k).Named("Task");
        WorkerTaskIndex.SetValueRange(n);

        // Initialise truth
        BackgroundLabelProbPrior = Variable.New<Dirichlet>().Named("TruthProbPrior");
        BackgroundLabelProb = Variable<Vector>.Random(BackgroundLabelProbPrior).Named("TruthProb");
        BackgroundLabelProb.SetValueRange(c);

        // Truth distributions
        TrueLabel = Variable.Array<int>(n).Named("Truth");
        TrueLabel[n] = Variable.Discrete(BackgroundLabelProb).ForEach(n);

        //VocabSize = Variable.New<int>();
        w = new Range(VocabSize).Named("W");
        ProbWord = Variable.Array<Vector>(c).Named("ProbWord");
        ProbWord.SetValueRange(w);
        WordCount = Variable.Array<int>(n).Named("WordCount");
        nw = new Range(WordCount[n]).Named("WN");
        Words = Variable.Array(Variable.Array<int>(nw), n).Named("Word");
        ProbWordPrior = Variable.New<Dirichlet>().Named("ProbWordPrior");
        ProbWord[c] = Variable<Vector>.Random(ProbWordPrior).ForEach(c);

        // Initialise user profiles
        ConfusionMatrixPrior = Variable.Array(Variable.Array<Dirichlet>(c), k).Named("WorkerConfusionMatrixPrior");
        WorkerConfusionMatrix = Variable.Array(Variable.Array<Vector>(c), k).Named("WorkerConfusionMatrix");
        WorkerConfusionMatrix[k][c] = Variable<Vector>.Random(ConfusionMatrixPrior[k][c]);
        WorkerConfusionMatrix.SetValueRange(c);

        // Vote distributions: each worker's label for a task is drawn from the row
        // of that worker's confusion matrix selected by the task's true label.
        WorkerLabel = Variable.Array(Variable.Array<int>(kn), k).Named("WorkerLabel");
        using (Variable.ForEach(k))
        {
            var trueLabel = Variable.Subarray(TrueLabel, WorkerTaskIndex[k]).Named("TrueLabelSubarray");
            trueLabel.SetValueRange(c);
            using (Variable.ForEach(kn))
            {
                using (Variable.Switch(trueLabel[kn]))
                {
                    WorkerLabel[k][kn] = Variable.Discrete(WorkerConfusionMatrix[k][trueLabel[kn]]);
                }
            }
        }

        // Words inference: words of a document are drawn i.i.d. from the
        // word distribution of the task's true label.
        using (Variable.ForEach(n))
        {
            using (Variable.Switch(TrueLabel[n]))
            {
                Words[n][nw] = Variable.Discrete(ProbWord[TrueLabel[n]]).ForEach(nw);
            }
        }
        block.CloseBlock();
    }

    /// <summary>
    /// Attaches the observed crowd labels and task assignments to the model.
    /// </summary>
    private void ObserveCrowdLabels(int[][] workerLabel, int[][] workerTaskIndex)
    {
        // NOTE(review): this prior is immediately overwritten by SetBiasedPriors below,
        // which sets the same uniform value — the first assignment appears redundant.
        BackgroundLabelProbPrior.ObservedValue = Dirichlet.Uniform(c.SizeAsInt);
        WorkerCount.ObservedValue = workerLabel.Length;
        WorkerLabel.ObservedValue = workerLabel;
        WorkerTaskCount.ObservedValue = workerTaskIndex.Select(tasks => tasks.Length).ToArray();
        WorkerTaskIndex.ObservedValue = workerTaskIndex;
        SetBiasedPriors(WorkerCount.ObservedValue);
    }

    /// <summary>
    /// Attaches the observed word indices and per-task word counts.
    /// </summary>
    private void ObserveWords(int[][] words, int[] wordCounts)
    {
        Words.ObservedValue = words;
        WordCount.ObservedValue = wordCounts;
    }

    /// <summary>
    /// Clamps the true labels (used when gold labels are available).
    /// </summary>
    private void ObserveTrueLabels(int[] trueLabels)
    {
        TrueLabel.ObservedValue = trueLabels;
    }

    /// <summary>
    /// Sets priors that favour the diagonal of each worker's confusion matrix
    /// (pseudo-count 5.5 on the diagonal, 1 elsewhere) and uniform priors for
    /// the background label and word distributions.
    /// </summary>
    /// <param name="workerCount">The number of workers.</param>
    public void SetBiasedPriors(int workerCount)
    {
        // uniform over true values
        BackgroundLabelProbPrior.ObservedValue = Dirichlet.Uniform(c.SizeAsInt);
        ConfusionMatrixPrior.ObservedValue = Util.ArrayInit(workerCount, input => Util.ArrayInit(c.SizeAsInt, l => new Dirichlet(Util.ArrayInit(c.SizeAsInt, l1 => l1 == l ? 5.5 : 1))));
        ProbWordPrior.ObservedValue = Dirichlet.Symmetric(w.SizeAsInt, 1);
    }

    /* Inference */

    /// <summary>
    /// Observes the data and runs inference, printing the first task's true-label
    /// posterior after each iteration to show convergence.
    /// </summary>
    /// <param name="workerLabel">Per-worker arrays of labels.</param>
    /// <param name="workerTaskIndex">Per-worker arrays of task indices.</param>
    /// <param name="words">Per-task arrays of word indices.</param>
    /// <param name="wordCounts">Per-task word counts.</param>
    /// <param name="trueLabels">Optional gold labels to clamp the truth.</param>
    /// <param name="numIterations">The number of inference iterations.</param>
    /// <returns>The inferred posteriors.</returns>
    public BCCWordsPosteriors InferPosteriors(
        int[][] workerLabel, int[][] workerTaskIndex, int[][] words, int[] wordCounts, int[] trueLabels = null,
        int numIterations = 35)
    {
        ObserveCrowdLabels(workerLabel, workerTaskIndex);
        ObserveWords(words, wordCounts);
        if (trueLabels != null)
        {
            ObserveTrueLabels(trueLabels);
        }

        BCCWordsPosteriors posteriors = new BCCWordsPosteriors();
        Console.WriteLine("\n***** BCC Words *****\n");
        // Increase the iteration count one step at a time so each loop pass
        // runs exactly one more message-passing iteration.
        for (int it = 1; it <= numIterations; it++)
        {
            Engine.NumberOfIterations = it;
            posteriors.TrueLabel = Engine.Infer<Discrete[]>(TrueLabel);
            posteriors.WorkerConfusionMatrix = Engine.Infer<Dirichlet[][]>(WorkerConfusionMatrix);
            posteriors.BackgroundLabelProb = Engine.Infer<Dirichlet>(BackgroundLabelProb);
            posteriors.ProbWordPosterior = Engine.Infer<Dirichlet[]>(ProbWord);
            Console.WriteLine("Iteration {0}:\t{1:0.0000}", it, posteriors.TrueLabel[0]);
        }
        posteriors.Evidence = Engine.Infer<Bernoulli>(evidence);
        return posteriors;
    }
}
/// <summary>
/// BCCWords posterior object.
/// </summary>
/// <remarks>
/// Extends <see cref="BCCPosteriors"/> with the word-distribution posteriors
/// inferred by the <see cref="BCCWords"/> model.
/// </remarks>
[Serializable]
public class BCCWordsPosteriors : BCCPosteriors
{
    /// <summary>
    /// The Dirichlet posteriors of the word probabilities for each true label value.
    /// </summary>
    public Dirichlet[] ProbWordPosterior;
}
}

Просмотреть файл

@ -0,0 +1,118 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
/* Language Understanding in the Wild: Combining Crowdsourcing and Machine Learning
*
* Software to run the experiment presented in the paper "Language Understanding in the Wild: Combining Crowdsourcing and Machine Learning" by Simpson et. al, WWW15
* To run it on your data:
* - Replace Data/labels.tsv with tab-separated fields <WorkerId, TaskId, Worker label, Text, Gold label (optional)>
* - Replace Data/stopwords.txt with the list of stop words, one for each line
*/
namespace CrowdsourcingWithWords
{
using System;
using System.Collections.Generic;
using System.IO;
class CrowdsourcingWithWords
{
    /// <summary>
    /// Main method to run the crowdsourcing experiments presented in Simpson et.al (WWW15).
    /// </summary>
    public static void Main()
    {
        var data = Datum.LoadData(Path.Combine("Data", "labels.tsv"));

        // Run model and get results.
        // The cast holds because Datum.LoadData builds a List<Datum> internally —
        // TODO confirm if LoadData's implementation ever changes.
        var VocabularyOnSubData = ResultsWords.BuildVocabularyOnSubdata((List<Datum>)data);

        BCCWords model = new BCCWords();
        ResultsWords resultsWords = new ResultsWords(data, VocabularyOnSubData);
        DataMappingWords mapping = resultsWords.Mapping as DataMappingWords;

        if (mapping != null)
        {
            // NOTE(review): resultsWords is re-created here with the same arguments as above;
            // the first instance is only used to obtain its Mapping — confirm whether the
            // second construction is required.
            resultsWords = new ResultsWords(data, VocabularyOnSubData);
            resultsWords.RunBCCWords("BCCwords", data, data, model, Results.RunMode.ClearResults, true);
        }

        // Write the results to standard output; the writer is disposed (and flushed) on exit.
        using (var writer = new StreamWriter(Console.OpenStandardOutput()))
        {
            resultsWords.WriteResults(writer, false, false, false, true);
        }

        Console.WriteLine("Done. Press enter to exit.");
        Console.ReadLine();
    }
}
/// <summary>
/// This class represents a single datum, and has methods to read in data.
/// </summary>
public class Datum
{
    /// <summary>
    /// The worker id.
    /// </summary>
    public string WorkerId;

    /// <summary>
    /// The task id.
    /// </summary>
    public string TaskId;

    /// <summary>
    /// The worker's label.
    /// </summary>
    public int WorkerLabel;

    /// <summary>
    /// The task's gold label (optional).
    /// </summary>
    public int? GoldLabel;

    /// <summary>
    /// The body text of the document (optional - only for text sentiment labelling tasks).
    /// </summary>
    public string BodyText;

    /// <summary>
    /// Loads the data file in the format (worker id, task id, worker label, body text, ?gold label).
    /// </summary>
    /// <param name="filename">The data file.</param>
    /// <param name="maxLength">The maximum number of data items to read.</param>
    /// <returns>The list of parsed data.</returns>
    /// <exception cref="FormatException">
    /// Thrown when a line has fewer than four tab-separated fields, or a label field is not an integer.
    /// </exception>
    public static IList<Datum> LoadData(string filename, int maxLength = short.MaxValue)
    {
        var result = new List<Datum>();
        using (var reader = new StreamReader(filename))
        {
            string line;
            int lineNumber = 0;
            while ((line = reader.ReadLine()) != null && result.Count < maxLength)
            {
                lineNumber++;
                var fields = line.Split('\t');
                if (fields.Length < 4)
                {
                    // Fail fast with a descriptive message instead of an IndexOutOfRangeException.
                    throw new FormatException($"Line {lineNumber} of '{filename}' has {fields.Length} tab-separated fields; at least 4 are required.");
                }
                result.Add(new Datum
                {
                    WorkerId = fields[0],
                    TaskId = fields[1],
                    WorkerLabel = int.Parse(fields[2]),
                    BodyText = fields[3],
                    // The gold label is optional; null when the fifth field is absent.
                    GoldLabel = fields.Length >= 5 ? int.Parse(fields[4]) : (int?)null
                });
            }
        }
        return result;
    }
}
}

Просмотреть файл

@ -0,0 +1,78 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<GenerateAssemblyInfo>false</GenerateAssemblyInfo>
<OutputType>Exe</OutputType>
<WarningLevel>4</WarningLevel>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<WarningsAsErrors />
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>false</Prefer32Bit>
<DefineConstants>TRACE</DefineConstants>
<Configurations>Debug;Release;DebugFull;DebugCore;ReleaseFull;ReleaseCore</Configurations>
</PropertyGroup>
<Choose>
<When Condition="'$(Configuration)'=='DebugFull' OR '$(Configuration)'=='ReleaseFull'">
<PropertyGroup>
<TargetFramework>net461</TargetFramework>
</PropertyGroup>
</When>
<When Condition="'$(Configuration)'=='DebugCore' OR '$(Configuration)'=='ReleaseCore'">
<PropertyGroup>
<TargetFramework>netcoreapp2.1</TargetFramework>
</PropertyGroup>
</When>
<Otherwise>
<PropertyGroup>
<TargetFrameworks>netcoreapp2.1;net461</TargetFrameworks>
</PropertyGroup>
</Otherwise>
</Choose>
<PropertyGroup Condition=" '$(TargetFramework)' == 'netcoreapp2.1'">
<DefineConstants>$(DefineConstants);NETCORE;NETSTANDARD;NETSTANDARD2_0</DefineConstants>
</PropertyGroup>
<PropertyGroup Condition=" '$(TargetFramework)' == 'net461'">
<DefineConstants>$(DefineConstants);NETFULL</DefineConstants>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU' OR '$(Configuration)|$(Platform)'=='DebugFull|AnyCPU' OR '$(Configuration)|$(Platform)'=='DebugCore|AnyCPU'">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<DefineConstants>$(DefineConstants);DEBUG</DefineConstants>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|AnyCPU' OR '$(Configuration)|$(Platform)'=='ReleaseFull|AnyCPU' OR '$(Configuration)|$(Platform)'=='ReleaseCore|AnyCPU'">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='DebugFull|AnyCPU'">
<PlatformTarget>AnyCPU</PlatformTarget>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ReleaseFull|AnyCPU'">
<PlatformTarget>AnyCPU</PlatformTarget>
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\..\Compiler\Compiler.csproj" />
<ProjectReference Include="..\..\Runtime\Runtime.csproj" />
</ItemGroup>
<ItemGroup>
<Compile Include="..\..\Shared\SharedAssemblyFileVersion.cs" />
<Compile Include="..\..\Shared\SharedAssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Update="Data\labels.tsv">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Data\stopwords.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,3 @@
worker1 task1 1 Some interesting text 2
worker2 task1 2 Some interesting text 2
worker3 task1 2 Some interesting text 2
1 worker1 task1 1 Some interesting text 2
2 worker2 task1 2 Some interesting text 2
3 worker3 task1 2 Some interesting text 2

Просмотреть файл

@ -0,0 +1,7 @@
the
is
at
on
in
and
a

Просмотреть файл

@ -0,0 +1,357 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.ML.Probabilistic.Math;
using Microsoft.ML.Probabilistic.Utilities;
using System;
using System.Collections.Generic;
using System.Linq;
namespace CrowdsourcingWithWords
{
/// <summary>
/// Data mapping class. This class manages the mapping between the data (which is
/// in the form of task, worker ids, and labels) and the model data (which is in term of indices).
/// </summary>
public class DataMapping
{
    /// <summary>
    /// The mapping from the worker index to the worker id.
    /// </summary>
    public string[] WorkerIndexToId;

    /// <summary>
    /// The mapping from the worker id to the worker index.
    /// </summary>
    public Dictionary<string, int> WorkerIdToIndex;

    /// <summary>
    /// The mapping from the community id to the community index.
    /// </summary>
    public Dictionary<string, int> CommunityIdToIndex;

    /// <summary>
    /// The mapping from the community index to the community id.
    /// </summary>
    public string[] CommunityIndexToId;

    /// <summary>
    /// The mapping from the task index to the task id.
    /// </summary>
    public string[] TaskIndexToId;

    /// <summary>
    /// The mapping from the task id to the task index.
    /// </summary>
    public Dictionary<string, int> TaskIdToIndex;

    /// <summary>
    /// The lower bound of the labels range.
    /// </summary>
    public int LabelMin;

    /// <summary>
    /// The upper bound of the labels range.
    /// </summary>
    public int LabelMax;

    /// <summary>
    /// The enumerable list of data.
    /// </summary>
    public IEnumerable<Datum> Data
    {
        get;
    }

    /// <summary>
    /// The filtered enumerable list of data with gold labels.
    /// </summary>
    public IEnumerable<Datum> DataWithGold
    {
        get;
    }

    /// <summary>
    /// The number of label values.
    /// </summary>
    public int LabelCount => LabelMax - LabelMin + 1;

    /// <summary>
    /// The number of workers.
    /// </summary>
    public int WorkerCount => WorkerIndexToId.Length;

    /// <summary>
    /// The number of tasks.
    /// </summary>
    public int TaskCount => TaskIndexToId.Length;

    /// <summary>
    /// Creates a data mapping.
    /// </summary>
    /// <param name="data">The data.</param>
    /// <param name="numCommunities">The number of communities.</param>
    /// <param name="labelMin">The lower bound of the labels range.</param>
    /// <param name="labelMax">The upper bound of the labels range.</param>
    public DataMapping(IEnumerable<Datum> data, int numCommunities = -1, int labelMin = int.MaxValue, int labelMax = int.MinValue)
    {
        WorkerIndexToId = data.Select(d => d.WorkerId).Distinct().ToArray();
        WorkerIdToIndex = WorkerIndexToId.Select((id, idx) => new KeyValuePair<string, int>(id, idx)).ToDictionary(x => x.Key, y => y.Value);
        TaskIndexToId = data.Select(d => d.TaskId).Distinct().ToArray();
        TaskIdToIndex = TaskIndexToId.Select((id, idx) => new KeyValuePair<string, int>(id, idx)).ToDictionary(x => x.Key, y => y.Value);
        var labels = data.SelectMany(d => d.GoldLabel.HasValue ? new[] { d.GoldLabel.Value, d.WorkerLabel } : new[] { d.WorkerLabel })
            .Distinct().OrderBy(lab => lab).ToArray();

        // The defaults (int.MaxValue/int.MinValue) make this condition false, so the
        // label range is derived from the data unless an explicit range was supplied.
        if (labelMin <= labelMax)
        {
            LabelMin = labelMin;
            LabelMax = labelMax;
        }
        else
        {
            LabelMin = labels.Min();
            LabelMax = labels.Max();
        }
        Data = data;
        if (numCommunities > 0)
        {
            CommunityIndexToId = Util.ArrayInit(numCommunities, comm => "Community" + comm);
            CommunityIdToIndex = CommunityIndexToId.Select((id, idx) => new KeyValuePair<string, int>(id, idx)).ToDictionary(x => x.Key, y => y.Value);
        }
        // NOTE(review): Data and DataWithGold store the enumerable itself, so a lazily
        // evaluated source would be re-enumerated on each use — confirm callers pass a list.
        DataWithGold = data.Where(d => d.GoldLabel.HasValue);
    }

    /// <summary>
    /// Returns the matrix of the task indices (columns) of each worker (rows).
    /// </summary>
    /// <param name="data">The data.</param>
    /// <returns>The matrix of the task indices (columns) of each worker (rows).</returns>
    public int[][] GetTaskIndicesPerWorkerIndex(IEnumerable<Datum> data)
    {
        int[][] result = new int[WorkerCount][];
        for (int i = 0; i < WorkerCount; i++)
        {
            var wid = WorkerIndexToId[i];
            result[i] = data.Where(d => d.WorkerId == wid).Select(d => TaskIdToIndex[d.TaskId]).ToArray();
        }
        return result;
    }

    /// <summary>
    /// Returns the matrix of the labels (columns) of each worker (rows).
    /// Labels are shifted to be zero-based by subtracting LabelMin.
    /// </summary>
    /// <param name="data">The data.</param>
    /// <returns>The matrix of the labels (columns) of each worker (rows).</returns>
    public int[][] GetLabelsPerWorkerIndex(IEnumerable<Datum> data)
    {
        int[][] result = new int[WorkerCount][];
        for (int i = 0; i < WorkerCount; i++)
        {
            var wid = WorkerIndexToId[i];
            result[i] = data.Where(d => d.WorkerId == wid).Select(d => d.WorkerLabel - LabelMin).ToArray();
        }
        return result;
    }

    /// <summary>
    /// Returns the gold labels of each task.
    /// </summary>
    /// <returns>The dictionary keyed by task id and the value is the gold label.</returns>
    public Dictionary<string, int?> GetGoldLabelsPerTaskId()
    {
        // Gold labels that are not consistent are returned as null
        // Labels are returned as indexed by task index
        return Data.GroupBy(d => d.TaskId).
            Select(t => t.GroupBy(d => d.GoldLabel).Where(d => d.Key.HasValue)).
            Where(gold_d => gold_d.Count() > 0).
            Select(gold_d =>
            {
                // count = number of distinct gold label values seen for this task.
                int count = gold_d.Distinct().Count();
                var datum = gold_d.First().First();
                if (count == 1)
                {
                    var gold = datum.GoldLabel;
                    if (gold != null)
                        gold = gold.Value - LabelMin;
                    return new Tuple<string, int?>(datum.TaskId, gold);
                }
                else if (count > 1)
                {
                    throw new Exception($"Conflicting gold labels for task {gold_d.First().First().TaskId}");
                }
                else
                {
                    // NOTE(review): this branch appears unreachable — empty groups were
                    // filtered out above, so count is always >= 1.
                    return new Tuple<string, int?>(datum.TaskId, null);
                }
            }).ToDictionary(tup => tup.Item1, tup => tup.Item2);
    }

    /// <summary>
    /// For each task, picks a uniformly random one of its worker labels.
    /// NOTE(review): unlike GetLabelsPerWorkerIndex, the returned label is NOT
    /// shifted by LabelMin — confirm whether that is intended.
    /// </summary>
    /// <param name="data">The data.</param>
    /// <returns>The dictionary keyed by task id with a randomly chosen worker label.</returns>
    public Dictionary<string, int?> GetRandomLabelPerTaskId(IList<Datum> data)
    {
        // Labels are returned as indexed by task index
        return data.GroupBy(d => d.TaskId).
            Select(collection =>
            {
                int r = Rand.Int(0, collection.Count() - 1);
                return new Tuple<string, int?>(collection.Key, collection.ToArray()[r].WorkerLabel);
            }).ToDictionary(tup => tup.Item1, tup => tup.Item2);
    }

    /// <summary>
    /// Builds a single-worker dataset from a dictionary of assigned labels,
    /// attributing every label to the first worker and copying the body text
    /// from the original data.
    /// NOTE(review): the (int) cast throws if an assigned label is null — confirm
    /// callers never pass null values.
    /// </summary>
    /// <param name="AssignedLabels">The labels assigned to each task id.</param>
    /// <param name="OriginalData">The original data (source of the body text).</param>
    /// <returns>The constructed list of data.</returns>
    public List<Datum> BuildDataFromAssignedLabels(Dictionary<string, int?> AssignedLabels, IList<Datum> OriginalData)
    {
        List<Datum> data = new List<Datum>();
        string firstWorkerId = WorkerIndexToId[0];
        foreach (var entry in AssignedLabels)
        {
            var datum = new Datum();
            datum.TaskId = entry.Key;
            datum.GoldLabel = entry.Value;
            datum.WorkerLabel = (int)entry.Value;
            datum.WorkerId = firstWorkerId;
            datum.BodyText = OriginalData.Where(d => d.TaskId == entry.Key).First().BodyText;

            data.Add(datum);
        }

        if (data.Count == 0)
            Console.WriteLine("*** Warning: There are no gold labels in the dataset ***");

        return data;
    }

    /// <summary>
    /// For each task, gets the majority vote label if it is unique.
    /// </summary>
    /// <returns>The list of majority vote labels.</returns>
    public int?[] GetMajorityVotesPerTaskIndex()
    {
        return Data.GroupBy(d => TaskIdToIndex[d.TaskId]).
            OrderBy(g => g.Key).
            Select(t => t.GroupBy(d => d.WorkerLabel - LabelMin).
                Select(g => new { label = g.Key, count = g.Count() })).
            Select(arr =>
            {
                int max = arr.Max(a => a.count);
                int[] majorityLabs = arr.Where(a => a.count == max).Select(a => a.label).ToArray();
                if (majorityLabs.Length == 1)
                    return (int?)majorityLabs[0];

                //return random label;
                // NOTE(review): despite the comment above, ties are broken by taking
                // the first tied label, not a random one — confirm intent.
                return majorityLabs[0];
            }).ToArray();
    }

    /// <summary>
    /// For each task Id, gets the majority vote label if it is unique.
    /// </summary>
    /// <param name="data">The data whose task ids should be included.</param>
    /// <returns>The dictionary of majority vote labels indexed by task id.</returns>
    public Dictionary<string, int?> GetMajorityVotesPerTaskId(IList<Datum> data)
    {
        Dictionary<string, int?> majorityVotesPerTaskId = new Dictionary<string, int?>();
        // Majority votes are computed over all of Data; this method only filters
        // them down to the task ids present in the supplied data.
        var majorityVotes = GetMajorityVotesPerTaskIndex();
        foreach (var d in data)
        {
            if (!majorityVotesPerTaskId.ContainsKey(d.TaskId))
                majorityVotesPerTaskId[d.TaskId] = majorityVotes[TaskIdToIndex[d.TaskId]];
        }
        return majorityVotesPerTaskId;
    }
}
/// <summary>
/// Data mapping class. This class manages the mapping between the data (which is
/// in the form of task, worker ids, and labels) and the model data (which is in term of indices),
/// extended with the word-level mappings used by the BCCWords model.
/// </summary>
public class DataMappingWords : DataMapping
{
    /// <summary>
    /// The vocabulary
    /// </summary>
    public List<string> Vocabulary;

    /// <summary>
    /// The size of the vocabulary.
    /// </summary>
    public int WordCount
    {
        get
        {
            return Vocabulary.Count();
        }
    }

    // The number of (vocabulary) words in each task's document, indexed by task index.
    public int[] WordCountsPerTaskIndex;

    // The vocabulary indices of the words in each task's document, indexed by task index.
    public int[][] WordIndicesPerTaskIndex;

    // Label value names for the CrowdFlower and sentiment-polarity datasets.
    public string[] CFLabelName = { "Negative", "Neutral", "Positive", "NotRelated", "Unknown" };
    public string[] SPLabelName = { "Negative", "Positive" };

    /// <summary>
    /// Creates the word-level data mapping. When word counts/indices are not
    /// supplied they are computed from the data via GetWordIndicesAndCountsPerTaskIndex.
    /// </summary>
    /// <param name="data">The data.</param>
    /// <param name="vocab">The vocabulary.</param>
    /// <param name="wordCountPerTaskIndex">Optional precomputed word counts per task.</param>
    /// <param name="wordIndicesPerTaskIndex">Optional precomputed word indices per task.</param>
    /// <param name="buildFullMapping">When true, rebuilds the task id/index mappings from the data.</param>
    public DataMappingWords(
        IEnumerable<Datum> data,
        List<string> vocab,
        int[] wordCountPerTaskIndex = null,
        int[][] wordIndicesPerTaskIndex = null,
        bool buildFullMapping = false)
        : base(data)
    {
        Vocabulary = vocab;
        if (wordCountPerTaskIndex == null)
            GetWordIndicesAndCountsPerTaskIndex(data, out WordIndicesPerTaskIndex, out WordCountsPerTaskIndex);
        else
        {
            WordCountsPerTaskIndex = wordCountPerTaskIndex;
            WordIndicesPerTaskIndex = wordIndicesPerTaskIndex;
        }

        if (buildFullMapping) // Use task ids as worker ids
        {
            TaskIndexToId = data.Select(d => d.TaskId).Distinct().ToArray();
            TaskIdToIndex = TaskIndexToId.Select((id, idx) => new KeyValuePair<string, int>(id, idx)).ToDictionary(x => x.Key, y => y.Value);
        }
    }

    /// <summary>
    /// Computes, for each task, the vocabulary indices and counts of the words
    /// in a representative document of that task.
    /// </summary>
    /// <param name="data">The data.</param>
    /// <param name="wordIndicesPerTaskIndex">Matrix of word indices for each task index</param>
    /// <param name="wordCountsPerTaskIndex">Matrix of word counts for each task index</param>
    public void GetWordIndicesAndCountsPerTaskIndex(IEnumerable<Datum> data, out int[][] wordIndicesPerTaskIndex, out int[] wordCountsPerTaskIndex)
    {
        wordIndicesPerTaskIndex = new int[TaskCount][];
        wordCountsPerTaskIndex = new int[TaskCount];
        string[] corpus = new string[TaskCount];

        // Dictionary keyed by task Id, with randomly order labelings
        var groupedRandomisedData =
            data.GroupBy(d => d.TaskId).
            Select(g =>
            {
                var arr = g.ToArray();
                int cnt = arr.Length;
                var perm = Rand.Perm(cnt);
                return new
                {
                    key = g.Key,
                    arr = g.Select((t, i) => arr[perm[i]]).ToArray()
                };
            }).ToDictionary(a => a.key, a => a.arr);

        // The body text is taken from the first datum of the random permutation,
        // i.e. a randomly chosen labeling of the task. Documents for the same task
        // are expected to share the same body text — TODO confirm.
        foreach (var kvp in groupedRandomisedData)
        {
            corpus[TaskIdToIndex[kvp.Key]] = kvp.Value.First().BodyText;
        }

        // TFIDFClass is defined elsewhere in this project; it maps each document
        // to the indices of its (stemmed) words within the vocabulary.
        wordIndicesPerTaskIndex = TFIDFClass.GetWordIndexStemmedDocs(corpus, Vocabulary);
        wordCountsPerTaskIndex = wordIndicesPerTaskIndex.Select(t => t.Length).ToArray();
    }
}
}

Просмотреть файл

@ -0,0 +1,263 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.Linq;
namespace CrowdsourcingWithWords
{
/// <summary>
/// Receiver Operating Characteristic (ROC) Curve
/// </summary>
/// <remarks>
/// In signal detection theory, a receiver operating characteristic (ROC), or simply
/// ROC curve, is a graphical plot of the sensitivity vs. (1 specificity) for a
/// binary classifier system as its discrimination threshold is varied.
///
/// References:
/// http://en.wikipedia.org/wiki/Receiver_operating_characteristic
/// http://www.anaesthetist.com/mnm/stats/roc/Findex.htm
/// http://radiology.rsna.org/content/148/3/839.full.pdf
/// </remarks>
public class ReceiverOperatingCharacteristic
{
    // Area under the curve; computed by Compute().
    private double area;

    // The actual, measured data
    private double[] measurement;

    // The data, as predicted by a test
    private double[] prediction;

    // The real number of positives and negatives in the measured (actual) data
    private int positiveCount;
    private int negativeCount;

    // The values which represent positive and negative values in our
    // measurement data (such as presence or absence of some disease)
    double dtrue;
    double dfalse;

    // The collection to hold our curve point information
    public PointCollection collection;

    /// <summary>
    /// Constructs a new Receiver Operating Characteristic model
    /// </summary>
    /// <param name="measurement">An array of binary values. Typically 0 and 1, or -1 and 1, indicating negative and positive cases, respectively.</param>
    /// <param name="prediction">An array of continuous values trying to approximate the measurement array.</param>
    public ReceiverOperatingCharacteristic(double[] measurement, double[] prediction)
    {
        this.measurement = measurement;
        this.prediction = prediction;

        // Determine which numbers correspond to each binary category
        // NOTE(review): dtrue is assigned the MINIMUM measurement value and dfalse
        // the MAXIMUM; with the conventional encoding (positive coded higher than
        // negative) these look swapped — confirm the intended label encoding.
        dtrue = measurement.Min();
        dfalse = measurement.Max();

        // Count the real number of positive and negative cases
        this.positiveCount = measurement.Count(m => m == dtrue);

        // Negative cases is just the number of cases minus the number of positives
        this.negativeCount = this.measurement.Length - this.positiveCount;
    }

    #region Public Methods

    /// <summary>
    /// Computes a ROC curve with 1/increment points
    /// </summary>
    /// <param name="increment">The increment over the previous point for each point in the curve.</param>
    public void Compute(double increment)
    {
        List<Point> points = new List<Point>();
        double cutoff;

        // Create the curve, computing a point for each cutoff value
        // NOTE(review): the loop counts upward from dfalse to dtrue; given the
        // constructor assigns dfalse = Max and dtrue = Min, the body never executes
        // when the two measurement values differ — verify against intended usage.
        for (cutoff = dfalse; cutoff <= dtrue; cutoff += increment)
        {
            points.Add(ComputePoint(cutoff));
        }
        if (cutoff < dtrue) points.Add(ComputePoint(dtrue));

        // Sort the curve by descending specificity
        points.Sort((a, b) => a.Specificity.CompareTo(b.Specificity));

        // Create the point collection
        this.collection = new PointCollection(points.ToArray());

        // Calculate area and error associated with this curve
        this.area = calculateAreaUnderCurve();
        calculateStandardError();
    }

    /// <summary>
    /// Computes one ROC point (confusion matrix) at the given cutoff threshold.
    /// </summary>
    Point ComputePoint(double threshold)
    {
        int truePositives = 0;
        int trueNegatives = 0;

        for (int i = 0; i < this.measurement.Length; i++)
        {
            bool measured = (this.measurement[i] == dtrue);
            bool predicted = (this.prediction[i] >= threshold);

            // If the prediction equals the true measured value
            if (predicted == measured)
            {
                // We have a hit. Now we have to see
                // if it was a positive or negative hit
                if (predicted)
                    truePositives++; // Positive hit
                else trueNegatives++;// Negative hit
            }
        }

        // The other values can be computed from available variables
        int falsePositives = negativeCount - trueNegatives;
        int falseNegatives = positiveCount - truePositives;

        return new Point(this, threshold,
            truePositives, trueNegatives,
            falsePositives, falseNegatives);
    }
    #endregion

    #region Private Methods

    /// <summary>
    /// Calculates the area under the ROC curve using the trapezium method
    /// </summary>
    private double calculateAreaUnderCurve()
    {
        double sum = 0.0;

        for (int i = 0; i < collection.Count - 1; i++)
        {
            // Obs: False Positive Rate = (1-specificity)
            var tpz = collection[i].Sensitivity + collection[i + 1].Sensitivity;
            tpz = tpz * (collection[i].FalsePositiveRate - collection[i + 1].FalsePositiveRate) / 2.0;
            sum += tpz;
        }
        return sum;
    }

    /// <summary>
    /// Calculates the standard error associated with this curve.
    /// The Q1/Q2 expressions match the AUC standard-error formula from the
    /// radiology reference cited in the class remarks (Hanley and McNeil).
    /// </summary>
    private double calculateStandardError()
    {
        double A = area;

        // real positive cases
        int Na = positiveCount;

        // real negative cases
        int Nn = negativeCount;

        double Q1 = A / (2.0 - A);
        double Q2 = 2 * A * A / (1.0 + A);
        return Math.Sqrt((A * (1.0 - A) +
            (Na - 1.0) * (Q1 - A * A) +
            (Nn - 1.0) * (Q2 - A * A)) / (Na * Nn));
    }
    #endregion

    #region Nested Classes

    /// <summary>
    /// The confusion matrix for the classified instances
    /// </summary>
    public class ConfusionMatrix
    {
        // 2x2 confusion matrix
        private int truePositives;
        private int trueNegatives;
        private int falsePositives;
        private int falseNegatives;

        /// <summary>
        /// Constructs a new Confusion Matrix.
        /// </summary>
        public ConfusionMatrix(int truePositives, int trueNegatives,
            int falsePositives, int falseNegatives)
        {
            this.truePositives = truePositives;
            this.trueNegatives = trueNegatives;
            this.falsePositives = falsePositives;
            this.falseNegatives = falseNegatives;
        }

        /// <summary>
        /// Sensitivity, also known as True Positive Rate
        /// </summary>
        /// <remarks>
        /// Sensitivity = TPR = TP / (TP + FN)
        /// </remarks>
        public double Sensitivity => (double)truePositives / (truePositives + falseNegatives);

        /// <summary>
        /// Specificity, also known as True Negative Rate
        /// </summary>
        /// <remarks>
        /// Specificity = TNR = TN / (FP + TN)
        /// or also as: TNR = (1-False Positive Rate)
        /// </remarks>
        public double Specificity => (double)trueNegatives / (trueNegatives + falsePositives);

        /// <summary>
        /// False Positive Rate, also known as false alarm rate.
        /// </summary>
        /// <remarks>
        /// It can be calculated as: FPR = FP / (FP + TN)
        /// or also as: FPR = (1-specificity)
        /// </remarks>
        public double FalsePositiveRate => (double)falsePositives / (falsePositives + trueNegatives);
    }

    /// <summary>
    /// Object to hold information about a Receiver Operating Characteristic Curve Point
    /// </summary>
    public class Point : ConfusionMatrix
    {
        /// <summary>
        /// Constructs a new Receiver Operating Characteristic point.
        /// </summary>
        internal Point(ReceiverOperatingCharacteristic curve, double cutoff,
            int truePositives, int trueNegatives, int falsePositives, int falseNegatives)
            : base(truePositives, trueNegatives, falsePositives, falseNegatives)
        {
            this.Cutoff = cutoff;
        }

        /// <summary>
        /// Gets the cutoff value (discrimination threshold) for this point.
        /// </summary>
        public double Cutoff { get; private set; }
    }

    /// <summary>
    /// Represents a Collection of Receiver Operating Characteristic (ROC) Curve points.
    /// This class cannot be instantiated.
    /// </summary>
    public class PointCollection : ReadOnlyCollection<Point>
    {
        internal PointCollection(Point[] points)
            : base(points)
        {
        }
    }
    #endregion
}
}

Просмотреть файл

@ -0,0 +1,519 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.ML.Probabilistic.Distributions;
using Microsoft.ML.Probabilistic.Math;
using Microsoft.ML.Probabilistic.Utilities;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace CrowdsourcingWithWords
{
/// <summary>
/// Results class containing posteriors and predictions.
/// </summary>
public class Results
{
/// <summary>
/// The posterior of the true label for each task.
/// </summary>
public Dictionary<string, Discrete> TrueLabel
{
get;
protected set;
}
/// <summary>
/// The predicted label for each task when doing simulations from the current
/// model state. It avoids overwriting the true label posterior.
/// </summary>
public Dictionary<string, Discrete> LookAheadTrueLabel
{
get;
protected set;
}
/// <summary>
/// The posterior for the constraint that allows online learning for the true label variable.
/// </summary>
public Dictionary<string, Discrete> TrueLabelConstraint
{
get;
protected set;
}
/// <summary>
/// The predicted label for each task
/// </summary>
public Dictionary<string, int?> PredictedLabel
{
get;
protected set;
}
/// <summary>
/// The probabilities that generate the true label of all the tasks.
/// </summary>
public Dirichlet BackgroundLabelProb
{
get;
protected set;
}
/// <summary>
/// The posterior of the confusion matrix of each worker.
/// </summary>
public Dictionary<string, Dirichlet[]> WorkerConfusionMatrix
{
get;
protected set;
}
/// <summary>
/// The look-ahead posterior of the confusion matrix of each worker obtained after simulating
/// a new label in look-ahead run mode.
/// </summary>
public Dictionary<string, Dirichlet[]> LookAheadWorkerConfusionMatrix
{
get;
protected set;
}
/// <summary>
/// The predictive probabilities of the labels produced by each worker.
/// </summary>
public Dictionary<string, Dictionary<string, Discrete>> WorkerPrediction
{
get;
protected set;
}
/// <summary>
/// The community membership probabilities of each worker.
/// </summary>
public Dictionary<string, Discrete> WorkerCommunity
{
get;
protected set;
}
/// <summary>
/// The confusion matrix of each community.
/// </summary>
public Dirichlet[][] CommunityConfusionMatrix
{
get;
protected set;
}
/// <summary>
/// The score matrix of each community.
/// </summary>
public VectorGaussian[][] CommunityScoreMatrix
{
get;
protected set;
}
/// <summary>
/// The posterior for the constraint that allows online learning for worker confusion matrices
/// int the community model.
/// </summary>
public Dictionary<string, VectorGaussian[]> WorkerScoreMatrixConstraint
{
get;
protected set;
}
/// <summary>
/// The probabilities that generate the community memberships of all the workers.
/// </summary>
public Dirichlet CommunityProb
{
get;
protected set;
}
/// <summary>
/// The posterior for the constraint that allows online learning for community membership.
/// int the community model.
/// </summary>
public Dictionary<string, Discrete> CommunityConstraint
{
get;
protected set;
}
/// <summary>
/// Model evidence.
/// </summary>
public Bernoulli ModelEvidence
{
get;
protected set;
}
/// <summary>
/// The data mapping.
/// </summary>
public DataMapping Mapping
{
get;
set;
}
/// <summary>
/// The full data mapping.
/// </summary>
public DataMapping FullMapping
{
get;
set;
}
/// <summary>
/// The gold labels of each task. The gold label type is nullable to
/// support the (usual) situation without labels.
/// </summary>
public Dictionary<string, int?> GoldLabels
{
get;
protected set;
}
/// <summary>
/// The accuracy of the current true label predictions.
/// </summary>
public double Accuracy
{
get;
private set;
}
/// <summary>
/// The accuracy of the worker labels.
/// </summary>
public double WorkerLabelAccuracy
{
get;
protected set;
}
/// <summary>
/// The negative log probability density (NLPD) scores of the current true label predictions.
/// </summary>
public double NegativeLogProb
{
get;
private set;
}
/// <summary>
/// The average recall of the current true label predictions.
/// </summary>
public double AvgRecall
{
get;
private set;
}
/// <summary>
/// The confusion matrix of the predicted true labels against the gold labels
/// The rows are the gold labels and the columns are the predicted labels.
/// </summary>
public double[,] ModelConfusionMatrix
{
get;
private set;
}
/// <summary>
/// The number of communities.
/// </summary>
public int CommunityCount
{
get;
private set;
}
/// <summary>
/// The confusion matrix of predicted labels against gold labels,
/// only populated when the label set is binary (see UpdateAccuracy).
/// </summary>
public ReceiverOperatingCharacteristic.ConfusionMatrix BinaryConfusionMatrix
{
get;
private set;
}
/// <summary>
/// The ROC curve of the predicted gold-label probabilities,
/// only populated when the label set is binary (see UpdateAccuracy).
/// </summary>
public ReceiverOperatingCharacteristic RocCurve
{
get;
private set;
}
// Gold labels collected for ROC computation; non-null only when Mapping.LabelCount == 2.
public List<double> trueBinaryLabel;
// Probability assigned to each task's gold label; parallel to trueBinaryLabel.
public List<double> probTrueBinaryLabel;
/// <summary>
/// The modes in which the model can be run.
/// </summary>
public enum RunMode
{
ClearResults,
BatchTraining,
IncrementalExperiment,
OnlineExperiment,
LookAheadExperiment,
LoadAndUseCommunityPriors,
Prediction,
};
/// <summary>
/// Resets all posteriors, constraints and predictions to their initial state.
/// </summary>
protected virtual void ClearResults()
{
    // Label-related state.
    BackgroundLabelProb = Dirichlet.Uniform(Mapping.LabelCount);
    TrueLabel = new Dictionary<string, Discrete>();
    TrueLabelConstraint = new Dictionary<string, Discrete>();
    PredictedLabel = new Dictionary<string, int?>();
    LookAheadTrueLabel = new Dictionary<string, Discrete>();

    // Per-worker state.
    WorkerConfusionMatrix = new Dictionary<string, Dirichlet[]>();
    LookAheadWorkerConfusionMatrix = new Dictionary<string, Dirichlet[]>();
    WorkerPrediction = new Dictionary<string, Dictionary<string, Discrete>>();
    WorkerCommunity = new Dictionary<string, Discrete>();

    // Community-model state.
    CommunityConfusionMatrix = null;
    CommunityScoreMatrix = null;
    CommunityProb = null;
    WorkerScoreMatrixConstraint = new Dictionary<string, VectorGaussian[]>();
    CommunityConstraint = new Dictionary<string, Discrete>();

    // Neutral evidence prior.
    ModelEvidence = new Bernoulli(0.5);
}
/// <summary>
/// Copies the posteriors from an inference run into this results object,
/// routing them to different stores depending on the run mode.
/// </summary>
/// <param name="posteriors">The posteriors produced by inference.</param>
/// <param name="mode">The run mode that produced the posteriors.</param>
protected virtual void UpdateResults(BCCPosteriors posteriors, RunMode mode)
{
if (mode == RunMode.LookAheadExperiment)
{
// Look-ahead runs write to separate stores so the main posteriors are not overwritten.
if (posteriors.TrueLabel != null)
{
for (int t = 0; t < posteriors.TrueLabel.Length; t++)
{
LookAheadTrueLabel[Mapping.TaskIndexToId[t]] = posteriors.TrueLabel[t];
}
}
if (posteriors.WorkerConfusionMatrix != null)
{
for (int w = 0; w < posteriors.WorkerConfusionMatrix.Length; w++)
{
LookAheadWorkerConfusionMatrix[Mapping.WorkerIndexToId[w]] = posteriors.WorkerConfusionMatrix[w];
}
}
}
else if (mode == RunMode.Prediction)
{
// Prediction runs store the per-worker predictive label distributions, keyed by worker then task.
if (posteriors.WorkerConfusionMatrix != null)
{
for (int w = 0; w < posteriors.WorkerConfusionMatrix.Length; w++)
{
WorkerPrediction[Mapping.WorkerIndexToId[w]] = new Dictionary<string, Discrete>();
for (int tw = 0; tw < posteriors.WorkerPrediction[w].Length; tw++)
{
WorkerPrediction[Mapping.WorkerIndexToId[w]][Mapping.TaskIndexToId[tw]] = posteriors.WorkerPrediction[w][tw];
}
}
}
}
else
{
// Update results for BCC
BackgroundLabelProb = posteriors.BackgroundLabelProb;
if (posteriors.WorkerConfusionMatrix != null)
{
for (int w = 0; w < posteriors.WorkerConfusionMatrix.Length; w++)
{
WorkerConfusionMatrix[Mapping.WorkerIndexToId[w]] = posteriors.WorkerConfusionMatrix[w];
}
}
if (posteriors.TrueLabel != null)
{
for (int t = 0; t < posteriors.TrueLabel.Length; t++)
{
TrueLabel[Mapping.TaskIndexToId[t]] = posteriors.TrueLabel[t];
}
}
// The true-label constraint enables online (incremental) learning on later runs.
if (posteriors.TrueLabelConstraint != null)
{
for (int t = 0; t < posteriors.TrueLabelConstraint.Length; t++)
{
TrueLabelConstraint[Mapping.TaskIndexToId[t]] = posteriors.TrueLabelConstraint[t];
}
}
this.ModelEvidence = posteriors.Evidence;
}
}
/// <summary>
/// Updates the evaluation metrics from the current results: Accuracy, NegativeLogProb,
/// AvgRecall, WorkerLabelAccuracy, the model confusion matrix and, for binary label
/// sets, the ROC curve and binary confusion matrix.
/// </summary>
protected virtual void UpdateAccuracy()
{
    // Cap on the per-task negative log probability so one confident mistake cannot dominate.
    double nlpdThreshold = -Math.Log(0.001);
    int labelCount = TrueLabel.First(kvp => kvp.Value != null).Value.Dimension;
    var confusionMatrix = Util.ArrayInit(labelCount, labelCount, (i, j) => 0.0);
    int correct = 0;
    double logProb = 0.0;
    int goldX = 0;

    // Only for binary labels
    if (Mapping.LabelCount == 2)
    {
        trueBinaryLabel = new List<double>();
        probTrueBinaryLabel = new List<double>();
    }

    foreach (var kvp in GoldLabels)
    {
        if (kvp.Value == null)
            continue;

        // We have a gold label
        goldX++;
        Discrete trueLabel = null;
        if (TrueLabel.ContainsKey(kvp.Key))
            trueLabel = TrueLabel[kvp.Key];
        if (trueLabel == null)
        {
            // No inferred label: fall back to a uniform distribution.
            trueLabel = Discrete.Uniform(Mapping.LabelCount);
        }

        var probs = trueLabel.GetProbs();
        double max = probs.Max();
        // Collect all labels tied at the maximum probability and break ties at random.
        var predictedLabels = probs.Select((p, i) => new
        {
            prob = p,
            idx = i
        }).Where(a => a.prob == max).Select(a => a.idx).ToArray();
        int predictedLabel = predictedLabels.Length == 1 ? predictedLabels[0] : predictedLabels[Rand.Int(predictedLabels.Length)];
        this.PredictedLabel[kvp.Key] = predictedLabel;

        int goldLabel = kvp.Value.Value;

        // BUG FIX: 'correct' was never incremented, so Accuracy was always reported as zero.
        correct += predictedLabel == goldLabel ? 1 : 0;

        confusionMatrix[goldLabel, predictedLabel] = confusionMatrix[goldLabel, predictedLabel] + 1.0;

        var nlp = -trueLabel.GetLogProb(goldLabel);
        if (nlp > nlpdThreshold)
            nlp = nlpdThreshold;
        logProb += nlp;

        if (trueBinaryLabel != null)
        {
            trueBinaryLabel.Add(goldLabel);
            probTrueBinaryLabel.Add(probs[goldLabel]);
        }
    }

    Accuracy = correct / (double)goldX;
    NegativeLogProb = logProb / goldX;
    ModelConfusionMatrix = confusionMatrix;

    // Average recall: mean over gold classes of (diagonal / row sum).
    double sumRec = 0;
    for (int i = 0; i < labelCount; i++)
    {
        double classSum = 0;
        for (int j = 0; j < labelCount; j++)
        {
            classSum += confusionMatrix[i, j];
        }

        sumRec += confusionMatrix[i, i] / classSum;
    }

    AvgRecall = sumRec / labelCount;

    // WorkerLabelAccuracy: Perc. agreement between worker label and gold label
    int sumAcc = 0;
    var LabelSet = Mapping.DataWithGold;
    int numLabels = LabelSet.Count();
    foreach (var datum in LabelSet)
    {
        sumAcc += datum.WorkerLabel == datum.GoldLabel ? 1 : 0;
    }

    WorkerLabelAccuracy = sumAcc / (double)numLabels;

    if (trueBinaryLabel != null && trueBinaryLabel.Count > 0)
    {
        RocCurve = new ReceiverOperatingCharacteristic(trueBinaryLabel.ToArray(), probTrueBinaryLabel.ToArray());
        RocCurve.Compute(0.001);
        BinaryConfusionMatrix = new ReceiverOperatingCharacteristic.ConfusionMatrix((int)confusionMatrix[1, 1], (int)confusionMatrix[0, 0], (int)confusionMatrix[0, 1], (int)confusionMatrix[1, 0]);
    }
}
/// <summary>
/// Writes a worker's confusion-matrix posterior by converting each row's Dirichlet
/// to its mean probabilities and delegating the formatting to WriteWorkerConfusionMatrix.
/// </summary>
/// <param name="writer">The destination writer.</param>
/// <param name="worker">The worker identifier used as a header.</param>
/// <param name="confusionMatrix">One Dirichlet posterior per true label (matrix row).</param>
public static void WriteConfusionMatrix(StreamWriter writer, string worker, Dirichlet[] confusionMatrix)
{
    int labelCount = confusionMatrix.Length;
    var means = new double[labelCount, labelCount];
    for (int row = 0; row < labelCount; row++)
    {
        var rowMean = confusionMatrix[row].GetMean();
        for (int col = 0; col < labelCount; col++)
        {
            means[row, col] = rowMean[col];
        }
    }

    WriteWorkerConfusionMatrix(writer, worker, means);
}
/// <summary>
/// Writes a confusion matrix as CSV-like text: the worker id, a header row of
/// column indices, then one row per true label with values to four decimal places.
/// </summary>
/// <param name="writer">The destination writer.</param>
/// <param name="worker">The worker identifier used as a header.</param>
/// <param name="confusionMatrix">Square matrix of probabilities (rows = true labels).</param>
public static void WriteWorkerConfusionMatrix(StreamWriter writer, string worker, double[,] confusionMatrix)
{
    int labelCount = confusionMatrix.GetLength(0);
    writer.WriteLine(worker);

    // Header row: one column index per predicted label.
    for (int col = 0; col < labelCount; col++)
    {
        writer.Write($",{col}");
    }

    writer.WriteLine();

    // One row per true label.
    for (int row = 0; row < labelCount; row++)
    {
        writer.Write(row);
        for (int col = 0; col < labelCount; col++)
        {
            writer.Write($",{confusionMatrix[row, col]:0.0000}");
        }

        writer.WriteLine();
    }
}
/// <summary>
/// Writes a summary of the inference results to a StreamWriter.
/// </summary>
/// <param name="writer">A StreamWriter instance.</param>
/// <param name="writeCommunityParameters">Set true to write the community confusion matrices.</param>
/// <param name="writeWorkerParameters">Set true to write worker confusion matrices.</param>
/// <param name="writeWorkerCommunities">Set true to write worker communities. NOTE(review): not referenced in this body - confirm intent.</param>
/// <param name="data">Optional raw data used to report per-task worker vote counts.</param>
public virtual void WriteResults(StreamWriter writer, bool writeCommunityParameters, bool writeWorkerParameters, bool writeWorkerCommunities, IList<Datum> data = null)
{
if (writeCommunityParameters && this.CommunityConfusionMatrix != null)
{
for (int communityIndex = 0; communityIndex < this.CommunityConfusionMatrix.Length; communityIndex++)
{
WriteConfusionMatrix(writer, "Community" + communityIndex, this.CommunityConfusionMatrix[communityIndex]);
}
}
if (writeWorkerParameters && this.WorkerConfusionMatrix != null)
{
// Only the first five distinct workers are written, to keep the output short.
foreach (var kvp in this.WorkerConfusionMatrix.Distinct().Take(5))
{
WriteConfusionMatrix(writer, kvp.Key, kvp.Value);
}
}
if (this.TrueLabel != null)
{
// Tasks are ordered by the posterior probability of label 0.
foreach (var kvp in this.TrueLabel.OrderBy(kvp => kvp.Value.GetProbs()[0]))
{
if (data != null)
{
// Also report how many workers voted 0 and 1 on this task.
var taskLabels = data.Where(d => d.TaskId == kvp.Key).Select(l => l.WorkerLabel);
var pos = taskLabels.Where(l => l == 1);
var neg = taskLabels.Where(l => l == 0);
int numPos = pos.Count();
int numNeg = neg.Count();
writer.WriteLine($"{kvp.Key}:\t{kvp.Value}\tnum_neg: {numNeg}\t num_pos: {numPos}");
}
else
{
writer.WriteLine($"{kvp.Key}:\t{kvp.Value}");
}
}
}
}
}
}

Просмотреть файл

@ -0,0 +1,236 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.ML.Probabilistic.Distributions;
using Microsoft.ML.Probabilistic.Math;
using Microsoft.ML.Probabilistic.Utilities;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
namespace CrowdsourcingWithWords
{
/// <summary>
/// Results class containing posteriors and predictions of BCCWords.
/// </summary>
public class ResultsWords : Results
{
/// <summary>
/// The posterior of the word probabilities for each true label.
/// </summary>
public Dirichlet[] ProbWords
{
get;
private set;
}
/// <summary>
/// The vocabulary
/// </summary>
public List<string> Vocabulary
{
get;
set;
}
/// <summary>
/// Creates an object for storing the inference results of BCCWords.
/// </summary>
/// <param name="data">The data.</param>
/// <param name="vocabulary">The vocabulary; if null, one is built from the data's body texts.</param>
public ResultsWords(IList<Datum> data, List<string> vocabulary)
{
    if (vocabulary == null)
    {
        // Build vocabulary
        Console.Write("Building vocabulary...");
        Stopwatch stopwatch = new Stopwatch();
        stopwatch.Start();
        string[] corpus = data.Select(d => d.BodyText).Distinct().ToArray();
        vocabulary = BuildVocabularyFromCorpus(corpus);
        Console.WriteLine("done. Elapsed time: {0}", stopwatch.Elapsed);
    }

    // Build data mapping.
    // BUG FIX: the vocabulary built above was previously overwritten by the (null)
    // constructor argument, so the data mapping was created with a null vocabulary.
    Vocabulary = vocabulary;
    this.Mapping = new DataMappingWords(data, Vocabulary);
    this.GoldLabels = Mapping.GetGoldLabelsPerTaskId();
}
/// <summary>
/// Runs the BCCWords model on the data and updates the stored results.
/// </summary>
/// <param name="modelName">The model name. NOTE(review): not referenced in this body - kept for interface compatibility.</param>
/// <param name="data">The data to run inference on.</param>
/// <param name="fullData">The full data set, used to build the complete task/worker mapping.</param>
/// <param name="model">The BCCWords model instance.</param>
/// <param name="mode">The run mode.</param>
/// <param name="calculateAccuracy">Compute the accuracy (true).</param>
/// <param name="useMajorityVote">Set true to replace worker labels with per-task majority votes.</param>
/// <param name="useRandomLabel">Set true to replace worker labels with random per-task labels.</param>
public void RunBCCWords(string modelName,
    IList<Datum> data,
    IList<Datum> fullData,
    BCCWords model,
    RunMode mode,
    bool calculateAccuracy,
    bool useMajorityVote = false,
    bool useRandomLabel = false)
{
    if (FullMapping == null)
        FullMapping = new DataMapping(fullData);

    if (Mapping == null)
    {
        // Build vocabulary
        Console.Write("Building vocabulary...");
        Stopwatch stopwatch = new Stopwatch();
        stopwatch.Start();
        string[] corpus = data.Select(d => d.BodyText).Distinct().ToArray();
        Vocabulary = BuildVocabularyFromCorpus(corpus);
        Console.WriteLine("done. Elapsed time: {0}", stopwatch.Elapsed);

        // Build data mapping.
        // BUG FIX: this previously read MappingWords.Vocabulary while MappingWords was
        // still null (NullReferenceException); use the vocabulary just built instead.
        this.Mapping = new DataMappingWords(data, Vocabulary);
        this.GoldLabels = Mapping.GetGoldLabelsPerTaskId();
    }

    DataMappingWords MappingWords = Mapping as DataMappingWords;

    int[] trueLabels = null;
    if (useMajorityVote)
    {
        if (MappingWords != null)
        {
            // Assign each task its majority-vote label; tasks without one get a random label.
            var majorityLabel = MappingWords.GetMajorityVotesPerTaskId(data);
            trueLabels = Util.ArrayInit(FullMapping.TaskCount, i => majorityLabel.ContainsKey(Mapping.TaskIndexToId[i]) ? (int)majorityLabel[Mapping.TaskIndexToId[i]] : Rand.Int(Mapping.LabelMin, Mapping.LabelMax + 1));
            data = MappingWords.BuildDataFromAssignedLabels(majorityLabel, data);
        }
    }

    if (useRandomLabel)
    {
        var randomLabels = MappingWords.GetRandomLabelPerTaskId(data);
        data = MappingWords.BuildDataFromAssignedLabels(randomLabels, data);
    }

    var labelsPerWorkerIndex = MappingWords.GetLabelsPerWorkerIndex(data);
    var taskIndicesPerWorkerIndex = MappingWords.GetTaskIndicesPerWorkerIndex(data);

    // Create model
    ClearResults();
    model.CreateModel(MappingWords.TaskCount, MappingWords.LabelCount, MappingWords.WordCount);

    // Run model inference
    BCCWordsPosteriors posteriors = model.InferPosteriors(labelsPerWorkerIndex, taskIndicesPerWorkerIndex, MappingWords.WordIndicesPerTaskIndex, MappingWords.WordCountsPerTaskIndex, trueLabels);

    // Update results
    UpdateResults(posteriors, mode);

    // Compute accuracy
    if (calculateAccuracy)
    {
        UpdateAccuracy();
    }
}
/// <summary>
/// Selects high-TFIDF terms from a corpus.
/// </summary>
/// <param name="corpus">Array of documents.</param>
/// <param name="tfidf_threshold">Minimum normalized TFIDF value for a term to be kept.</param>
/// <returns>The distinct terms whose normalized TFIDF exceeds the threshold in some document.</returns>
private static List<string> BuildVocabularyFromCorpus(string[] corpus, double tfidf_threshold = 0.8)
{
    List<string> vocabulary;
    double[][] inputs = TFIDFClass.Transform(corpus, out vocabulary, 0);
    inputs = TFIDFClass.Normalize(inputs);

    // Select high TF-IDF terms.
    List<string> vocabularyTfidf = new List<string>();
    for (int index = 0; index < inputs.Length; index++)
    {
        var sortedTerms = inputs[index].Select((x, i) => new KeyValuePair<string, double>(vocabulary[i], x)).OrderByDescending(x => x.Value).ToList();
        vocabularyTfidf.AddRange(sortedTerms.Where(entry => entry.Value > tfidf_threshold).Select(k => k.Key));
    }

    // BUG FIX: this previously returned the full vocabulary, silently discarding the
    // TFIDF filtering computed above.
    return vocabularyTfidf.Distinct().ToList();
}
/// <summary>
/// Resets all posteriors and predictions, including the word probabilities.
/// </summary>
protected override void ClearResults()
{
    // The base implementation resets all of the shared BCC state; this override
    // previously duplicated every one of those assignments verbatim. Only the
    // words-model-specific posterior needs clearing on top.
    base.ClearResults();
    ProbWords = null;
}
/// <summary>
/// Writes various results to a StreamWriter.
/// </summary>
/// <param name="writer">A StreamWriter instance.</param>
/// <param name="writeCommunityParameters">Set true to write community parameters.</param>
/// <param name="writeWorkerParameters">Set true to write worker parameters.</param>
/// <param name="writeWorkerCommunities">Set true to write worker communities.</param>
/// <param name="writeProbWords">Set true to write word probabilities.</param>
/// <param name="topWords">Number of top words to write per class.</param>
public void WriteResults(StreamWriter writer, bool writeCommunityParameters, bool writeWorkerParameters, bool writeWorkerCommunities, bool writeProbWords, int topWords = 30)
{
    // BUG FIX: writeWorkerParameters was previously dropped and writeWorkerCommunities
    // was passed twice to the base overload, so the worker-parameters flag was ignored.
    base.WriteResults(writer, writeCommunityParameters, writeWorkerParameters, writeWorkerCommunities);

    DataMappingWords MappingWords = Mapping as DataMappingWords;
    if (writeProbWords && this.ProbWords != null)
    {
        int NumClasses = ProbWords.Length;
        for (int c = 0; c < NumClasses; c++)
        {
            // Choose the label-name set: more than 300 workers is assumed to be the CF data set.
            if (MappingWords != null && MappingWords.WorkerCount > 300) // Assume it's CF
                writer.WriteLine("Class {0}", MappingWords.CFLabelName[c]);
            else
                if (MappingWords != null)
                writer.WriteLine("Class {0}", MappingWords.SPLabelName[c]);

            // Write the top words by log-probability under this class.
            Vector probs = ProbWords[c].GetMean();
            var probsDictionary = probs.Select((value, index) => new KeyValuePair<string, double>(MappingWords.Vocabulary[index], Math.Log(value))).OrderByDescending(x => x.Value).ToArray();

            // ROBUSTNESS: clamp in case the vocabulary is smaller than topWords.
            int wordsToWrite = Math.Min(topWords, probsDictionary.Length);
            for (int w = 0; w < wordsToWrite; w++)
            {
                writer.WriteLine($"\t{probsDictionary[w].Key}: \t{probsDictionary[w].Value:0.000}");
            }
        }
    }
}
/// <summary>
/// Build a vocabulary of terms for a subset of text snippets extracted from the data.
/// </summary>
/// <param name="data">The data.</param>
/// <returns>At most 300 vocabulary terms built from the first 20,000 rows.</returns>
public static List<string> BuildVocabularyOnSubdata(List<Datum> data)
{
    Console.WriteLine("Building vocabulary");

    // Restrict to the first 20,000 rows to keep vocabulary building fast.
    string[] corpus = data.Take(20000).Select(d => d.BodyText).Distinct().ToArray();
    var vocabularyOnSubData = BuildVocabularyFromCorpus(corpus);

    // Cap the vocabulary at 300 terms.
    return vocabularyOnSubData.Count > 300
        ? vocabularyOnSubData.GetRange(0, 300)
        : vocabularyOnSubData;
}
}
}

Просмотреть файл

@ -0,0 +1,319 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using Microsoft.ML.Probabilistic.Utilities;
namespace CrowdsourcingWithWords
{
/// <summary>
/// Copyright (c) 2013 Kory Becker http://www.primaryobjects.com/kory-becker.aspx
///
/// Permission is hereby granted, free of charge, to any person obtaining
/// a copy of this software and associated documentation files (the
/// "Software"), to deal in the Software without restriction, including
/// without limitation the rights to use, copy, modify, merge, publish,
/// distribute, sublicense, and/or sell copies of the Software, and to
/// permit persons to whom the Software is furnished to do so, subject to
/// the following conditions:
///
/// The above copyright notice and this permission notice shall be
/// included in all copies or substantial portions of the Software.
///
/// Description:
/// Performs a TF*IDF (Term Frequency * Inverse Document Frequency) transformation on an array of documents.
/// Each document string is transformed into an array of doubles, corresponding to its associated TF*IDF values.
///
/// Usage:
/// string[] documents = LoadYourDocuments();
///
/// double[][] inputs = TFIDF.Transform(documents);
/// inputs = TFIDF.Normalize(inputs);
///
/// </summary>
public static class TFIDFClass
{
/// <summary>
/// Document vocabulary, containing each word's IDF value.
/// </summary>
private static Dictionary<string, double> _vocabularyIDF = new Dictionary<string, double>();
/// <summary>
/// Transforms a list of documents into their associated TF*IDF values.
/// If a vocabulary does not yet exist, one will be created, based upon the documents' words.
/// </summary>
/// <param name="documents">string[]</param>
/// <param name="vocabulary">The vocabulary</param>
/// <param name="vocabularyThreshold">Minimum number of occurrences of the term within all documents</param>
/// <returns>double[][]</returns>
public static double[][] Transform(string[] documents, out List<string> vocabulary, int vocabularyThreshold = 3)
{
List<List<string>> stemmedDocs;
// Get the vocabulary and stem the documents at the same time.
vocabulary = GetVocabulary(documents, out stemmedDocs, vocabularyThreshold);
if (_vocabularyIDF.Count == 0)
{
// Calculate the IDF for each vocabulary term.
// NOTE(review): _vocabularyIDF is a static cache, so IDF values are computed only on
// the first call and reused for every later corpus - confirm this is intended.
_vocabularyIDF = vocabulary.ToDictionary(term => term, term =>
{
double numberOfDocsContainingTerm = stemmedDocs.Count(d => d.Contains(term));
// Smoothed IDF: log(N / (1 + number of documents containing the term)).
return Math.Log(stemmedDocs.Count / (1 + numberOfDocsContainingTerm));
});
}
// Transform each document into a vector of tfidf values.
return TransformToTFIDFVectors(stemmedDocs, _vocabularyIDF);
}
/// <summary>
/// Converts a list of stemmed documents (lists of stemmed words) and their associated vocabulary + idf values, into an array of TF*IDF values.
/// </summary>
/// <param name="stemmedDocs">List of List of string</param>
/// <param name="vocabularyIDF">Dictionary of string, double (term, IDF)</param>
/// <returns>double[][]</returns>
private static double[][] TransformToTFIDFVectors(List<List<string>> stemmedDocs, Dictionary<string, double> vocabularyIDF)
{
    // For each document: tf = number of times the term appears in that document,
    // tfidf = tf * idf. Vector components follow the enumeration order of vocabularyIDF.
    return stemmedDocs
        .Select(doc => vocabularyIDF
            .Select(vocab => doc.Count(token => token == vocab.Key) * vocab.Value)
            .ToArray())
        .ToArray();
}
/// <summary>
/// Normalizes a TF*IDF array of vectors using L2-Norm.
/// Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2)
/// </summary>
/// <param name="vectors">The vectors to normalize.</param>
/// <returns>The row-wise L2-normalized vectors.</returns>
public static double[][] Normalize(double[][] vectors)
{
    // Normalize each row independently using the single-vector overload.
    return vectors.Select(Normalize).ToArray();
}

/// <summary>
/// Normalizes a TF*IDF vector using L2-Norm.
/// Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2)
/// </summary>
/// <param name="vector">The vector to normalize.</param>
/// <returns>The L2-normalized vector; an all-zero vector is returned as a zero copy.</returns>
public static double[] Normalize(double[] vector)
{
    double sumSquared = 0;
    foreach (var value in vector)
    {
        sumSquared += value * value;
    }

    double norm = Math.Sqrt(sumSquared);

    // ROBUSTNESS FIX: an all-zero vector previously produced NaN entries (0 / 0).
    if (norm == 0)
    {
        return (double[])vector.Clone();
    }

    // L2-norm: Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2)
    return vector.Select(value => value / norm).ToArray();
}
#region Private Helpers
/// <summary>
/// Parses and tokenizes a list of documents, returning a vocabulary of words.
/// </summary>
/// <param name="docs">The documents to process.</param>
/// <param name="stemmedDocs">Receives, per document, the list of retained (stemmed) words.</param>
/// <param name="vocabularyThreshold">Minimum number of occurrences of a word across all documents for it to enter the vocabulary.</param>
/// <returns>Vocabulary (list of strings)</returns>
private static List<string> GetVocabulary(string[] docs, out List<List<string>> stemmedDocs, int vocabularyThreshold)
{
    List<string> vocabulary = new List<string>();
    Dictionary<string, int> wordCountList = new Dictionary<string, int>();
    stemmedDocs = new List<List<string>>();

    // Stop words are read one-per-line from Data/stopwords.txt.
    string[] stopWordsList = File.ReadAllLines(Path.Combine("Data", "stopwords.txt"));

    int docIndex = 0;
    foreach (var doc in docs)
    {
        List<string> stemmedDoc = new List<string>();

        docIndex++;
        if (docIndex % 10000 == 0)
        {
            Console.WriteLine("Processing " + docIndex + "/" + docs.Length);
        }

        string[] parts2 = Tokenize(doc.ToLower());
        foreach (string part in parts2)
        {
            // Strip non-alphanumeric characters.
            string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");
            if (!stopWordsList.Contains(stripped.ToLower()))
            {
                try
                {
                    var english = new EnglishWord(stripped);
                    // NOTE(review): this uses Original while GetWordIndexStemmedDocs uses Stem;
                    // identical with the placeholder stemmer - confirm before using a real one.
                    string stem = english.Original;

                    if (stem.Length > 0)
                    {
                        // Build the word count list.
                        // BUG FIX: counts previously started at 0 on first sight, so every
                        // word was undercounted by one relative to vocabularyThreshold.
                        if (wordCountList.ContainsKey(stem))
                        {
                            wordCountList[stem]++;
                        }
                        else
                        {
                            wordCountList.Add(stem, 1);
                        }

                        stemmedDoc.Add(stem);
                    }
                }
                catch
                {
                    // Stemming failures are deliberately ignored (best effort).
                }
            }
        }

        stemmedDocs.Add(stemmedDoc);
    }

    // Keep only the words occurring at least vocabularyThreshold times.
    foreach (var item in wordCountList.Where(w => w.Value >= vocabularyThreshold))
    {
        vocabulary.Add(item.Key);
    }

    return vocabulary;
}
/// <summary>
/// Maps each document to the distinct vocabulary indices of its (stemmed) words.
/// </summary>
/// <param name="docs">The documents; null entries are skipped.</param>
/// <param name="vocabulary">The vocabulary to index into.</param>
/// <returns>Per document, the distinct vocabulary indices of words found in it.</returns>
public static int[][] GetWordIndexStemmedDocs(string[] docs, List<string> vocabulary)
{
    List<int>[] wordIndex = Util.ArrayInit(docs.Length, d => new List<int>());

    // NOTE(review): docIndex only advances for non-null docs, so results are compacted
    // toward the front of the array - confirm callers never pass null documents.
    int docIndex = 0;
    foreach (var doc in docs)
    {
        if (doc != null)
        {
            string[] parts2 = Tokenize(doc.ToLower());

            List<int> wordIndexDoc = new List<int>();
            foreach (string part in parts2)
            {
                // Strip non-alphanumeric characters.
                string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");
                try
                {
                    var english = new EnglishWord(stripped);
                    string stem = english.Stem;

                    // PERF: a single IndexOf replaces the previous Contains + IndexOf
                    // pair, halving the linear scans over the vocabulary.
                    int index = vocabulary.IndexOf(stem);
                    if (index >= 0)
                    {
                        wordIndexDoc.Add(index);
                    }
                }
                catch
                {
                    // ignored
                }
            }

            wordIndex[docIndex] = wordIndexDoc.Distinct().ToList();
            docIndex++;
        }
    }

    return wordIndex.Select(list => list.ToArray()).ToArray();
}
/// <summary>
/// Tokenizes a string, returning its list of words.
/// The replacements run in order; e.g. URLs are collapsed before the username rule
/// so their embedded '@' is not mistaken for a username.
/// </summary>
/// <param name="text">The text to tokenize (call sites pass it already lower-cased).</param>
/// <returns>The tokens; may include empty strings where separators were adjacent.</returns>
private static string[] Tokenize(string text)
{
// Strip all HTML.
text = Regex.Replace(text, "<[^<>]+>", "");
// Strip numbers.
text = Regex.Replace(text, "[0-9]+", "number");
// Strip urls.
text = Regex.Replace(text, @"(http|https)://[^\s]*", "httpaddr");
// Strip email addresses.
text = Regex.Replace(text, @"[^\s]+@[^\s]+", "emailaddr");
// Strip dollar sign.
text = Regex.Replace(text, "[$]+", "dollar");
// Strip usernames.
text = Regex.Replace(text, @"@[^\s]+", "username");
// Tokenize and also get rid of any punctuation
return text.Split(" @$/#.-:&*+=[]?!(){},''\">_<;%\\".ToCharArray());
}
#endregion
}
/// <summary>
/// This is a placeholder class. Replace with your own stemmer.
/// The "stem" it produces is simply the word itself.
/// </summary>
public class EnglishWord
{
    /// <summary>The word exactly as supplied.</summary>
    public readonly string Original;

    /// <summary>The stemmed form; identical to <see cref="Original"/> in this placeholder.</summary>
    public readonly string Stem;

    /// <summary>
    /// Wraps a word without modifying it.
    /// </summary>
    /// <param name="word">The word to wrap.</param>
    public EnglishWord(string word)
    {
        Original = Stem = word;
    }
}
}

Просмотреть файл

@ -154,7 +154,6 @@
<DependentUpon>Resources.resx</DependentUpon>
<DesignTime>True</DesignTime>
</Compile>
<None Include="app.config" />
<None Include="Properties\Settings.settings">
<Generator>SettingsSingleFileGenerator</Generator>
<LastGenOutput>Settings.Designer.cs</LastGenOutput>

Просмотреть файл

@ -1,3 +0,0 @@
<?xml version="1.0"?>
<configuration>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/></startup></configuration>

Просмотреть файл

@ -2,14 +2,12 @@
<PropertyGroup>
<GenerateAssemblyInfo>false</GenerateAssemblyInfo>
<OutputType>Exe</OutputType>
<AssemblyName>InferNET101</AssemblyName>
<WarningLevel>4</WarningLevel>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<WarningsAsErrors />
<ErrorReport>prompt</ErrorReport>
<Prefer32Bit>true</Prefer32Bit>
<DefineConstants>TRACE</DefineConstants>
<RootNamespace>InferNET101</RootNamespace>
<Configurations>Debug;Release;DebugFull;DebugCore;ReleaseFull;ReleaseCore</Configurations>
</PropertyGroup>
<Choose>
@ -59,4 +57,4 @@
<Compile Include="..\..\Shared\SharedAssemblyFileVersion.cs" />
<Compile Include="..\..\Shared\SharedAssemblyInfo.cs" />
</ItemGroup>
</Project>
</Project>

Просмотреть файл

@ -1,3 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/></startup></configuration>

Просмотреть файл

@ -97,6 +97,7 @@ namespace LDAExample
vocabulary);
}
Console.WriteLine("Done. Press enter to exit.");
Console.ReadLine();
}

Просмотреть файл

@ -1,3 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/></startup></configuration>

Просмотреть файл

@ -147,7 +147,6 @@
<Generator>ResXFileCodeGenerator</Generator>
<LastGenOutput>Resources.Designer.cs</LastGenOutput>
</EmbeddedResource>
<None Include="app.config" />
<None Include="Properties\Settings.settings">
<Generator>SettingsSingleFileGenerator</Generator>
<LastGenOutput>Settings.Designer.cs</LastGenOutput>

Просмотреть файл

@ -1,3 +0,0 @@
<?xml version="1.0"?>
<configuration>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/></startup></configuration>

Просмотреть файл

@ -1,6 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/>
</startup>
</configuration>

Просмотреть файл

@ -118,7 +118,7 @@ namespace MotifFinder
motifPositionPosterior);
//// Keep the application alive until the user enters a keystroke
Console.WriteLine("Done. Press enter to exit.");
Console.ReadKey();
}

Просмотреть файл

@ -1,3 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/></startup></configuration>

Просмотреть файл

@ -1,3 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/></startup></configuration>

Просмотреть файл

@ -1,3 +0,0 @@
<?xml version="1.0"?>
<configuration>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/></startup></configuration>

Просмотреть файл

@ -3,9 +3,13 @@
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Probabilistic.Collections;
using Microsoft.ML.Probabilistic.Distributions;
using Microsoft.ML.Probabilistic.Math;
using Microsoft.ML.Probabilistic.Models;
using Microsoft.ML.Probabilistic.Utilities;
namespace Microsoft.ML.Probabilistic.Tutorials
{
@ -15,33 +19,34 @@ namespace Microsoft.ML.Probabilistic.Tutorials
public class BayesianPCAModel
{
// Inference engine
public InferenceEngine engine = null;
public InferenceEngine engine;
// Model variables
public Variable<int> vN = null;
public Variable<int> vD = null;
public Variable<int> vM = null;
public VariableArray2D<double> vData = null;
public VariableArray2D<double> vW = null;
public VariableArray2D<double> vZ = null;
public VariableArray2D<double> vT = null;
public VariableArray2D<double> vU = null;
public VariableArray<double> vMu = null;
public VariableArray<double> vPi = null;
public VariableArray<double> vAlpha = null;
public Variable<int> observationCount;
public Variable<int> featureCount;
public Variable<int> componentCount;
public VariableArray2D<double> data;
public VariableArray2D<double> W;
public VariableArray2D<Gaussian> initW;
public VariableArray2D<double> Z;
public VariableArray2D<double> T;
public VariableArray2D<double> U;
public VariableArray<double> mu;
public VariableArray<double> pi;
public VariableArray<double> alpha;
// Priors - these are declared as distribution variables
// so that we can set them at run-time. They are variables
// from the perspective of the 'Random' factor which takes
// a distribution as an argument.
public Variable<Gamma> priorAlpha = null;
public Variable<Gaussian> priorMu = null;
public Variable<Gamma> priorPi = null;
public Variable<Gamma> priorAlpha;
public Variable<Gaussian> priorMu;
public Variable<Gamma> priorPi;
// Model ranges
public Range rN = null;
public Range rD = null;
public Range rM = null;
public Range observation;
public Range feature;
public Range component;
/// <summary>
/// Model constructor
@ -49,53 +54,55 @@ namespace Microsoft.ML.Probabilistic.Tutorials
public BayesianPCAModel()
{
// The various dimensions will be set externally...
vN = Variable.New<int>().Named("NumObs");
vD = Variable.New<int>().Named("NumFeats");
vM = Variable.New<int>().Named("MaxComponents");
rN = new Range(vN).Named("N");
rD = new Range(vD).Named("D");
rM = new Range(vM).Named("M");
observationCount = Variable.New<int>().Named(nameof(observationCount));
featureCount = Variable.New<int>().Named(nameof(featureCount));
componentCount = Variable.New<int>().Named(nameof(componentCount));
observation = new Range(observationCount).Named(nameof(observation));
feature = new Range(featureCount).Named(nameof(feature));
component = new Range(componentCount).Named(nameof(component));
// ... as will the data
vData = Variable.Array<double>(rN, rD).Named("data");
data = Variable.Array<double>(observation, feature).Named(nameof(data));
// ... and the priors
priorAlpha = Variable.New<Gamma>().Named("PriorAlpha");
priorMu = Variable.New<Gaussian>().Named("PriorMu");
priorPi = Variable.New<Gamma>().Named("PriorPi");
priorAlpha = Variable.New<Gamma>().Named(nameof(priorAlpha));
priorMu = Variable.New<Gaussian>().Named(nameof(priorMu));
priorPi = Variable.New<Gamma>().Named(nameof(priorPi));
// Mixing matrix. Each row is drawn from a Gaussian with zero mean and
// a precision which will be learnt. This is a form of Automatic
// Relevance Determination (ARD). The larger the precisions become, the
// less important that row in the mixing matrix is in explaining the data
vAlpha = Variable.Array<double>(rM).Named("Alpha");
vW = Variable.Array<double>(rM, rD).Named("W");
vAlpha[rM] = Variable.Random<double, Gamma>(priorAlpha).ForEach(rM);
vW[rM, rD] = Variable.GaussianFromMeanAndPrecision(0, vAlpha[rM]).ForEach(rD);
alpha = Variable.Array<double>(component).Named(nameof(alpha));
W = Variable.Array<double>(component, feature).Named(nameof(W));
alpha[component] = Variable<double>.Random(priorAlpha).ForEach(component);
W[component, feature] = Variable.GaussianFromMeanAndPrecision(0, alpha[component]).ForEach(feature);
// Initialize the W marginal to break symmetry
initW = Variable.Array<Gaussian>(component, feature).Named(nameof(initW));
W[component, feature].InitialiseTo(initW[component, feature]);
// Latent variables are drawn from a standard Gaussian
vZ = Variable.Array<double>(rN, rM).Named("Z");
vZ[rN, rM] = Variable.GaussianFromMeanAndPrecision(0.0, 1.0).ForEach(rN, rM);
Z = Variable.Array<double>(observation, component).Named(nameof(Z));
Z[observation, component] = Variable.GaussianFromMeanAndPrecision(0.0, 1.0).ForEach(observation, component);
// Multiply the latent variables with the mixing matrix...
vT = Variable.MatrixMultiply(vZ, vW).Named("T");
T = Variable.MatrixMultiply(Z, W).Named(nameof(T));
// ... add in a bias ...
vMu = Variable.Array<double>(rD).Named("mu");
vMu[rD] = Variable.Random<double, Gaussian>(priorMu).ForEach(rD);
vU = Variable.Array<double>(rN, rD).Named("U");
vU[rN, rD] = vT[rN, rD] + vMu[rD];
mu = Variable.Array<double>(feature).Named(nameof(mu));
mu[feature] = Variable<double>.Random(priorMu).ForEach(feature);
U = Variable.Array<double>(observation, feature).Named(nameof(U));
U[observation, feature] = T[observation, feature] + mu[feature];
// ... and add in some observation noise ...
vPi = Variable.Array<double>(rD).Named("pi");
vPi[rD] = Variable.Random<double, Gamma>(priorPi).ForEach(rD);
pi = Variable.Array<double>(feature).Named(nameof(pi));
pi[feature] = Variable<double>.Random(priorPi).ForEach(feature);
// ... to give the likelihood of observing the data
vData[rN, rD] = Variable.GaussianFromMeanAndPrecision(vU[rN, rD], vPi[rD]);
data[observation, feature] = Variable.GaussianFromMeanAndPrecision(U[observation, feature], pi[feature]);
// Inference engine
engine = new InferenceEngine();
return;
}
}
@ -113,32 +120,32 @@ namespace Microsoft.ML.Probabilistic.Tutorials
Console.WriteLine("This example only runs with Variational Message Passing");
return;
}
// Set a stable random number seed for repeatable runs
Rand.Restart(12347);
double[,] data = generateData(1000);
// Set the data
bpca.vData.ObservedValue = data;
bpca.data.ObservedValue = data;
// Set the dimensions
bpca.vN.ObservedValue = data.GetLength(0);
bpca.vD.ObservedValue = data.GetLength(1);
bpca.vM.ObservedValue = 6;
bpca.observationCount.ObservedValue = data.GetLength(0);
bpca.featureCount.ObservedValue = data.GetLength(1);
bpca.componentCount.ObservedValue = 6;
// Set the priors
bpca.priorMu.ObservedValue = Gaussian.FromMeanAndPrecision(0.0, 0.01);
bpca.priorPi.ObservedValue = Gamma.FromShapeAndRate(2.0, 2.0);
bpca.priorAlpha.ObservedValue = Gamma.FromShapeAndRate(2.0, 2.0);
// Initialize the W marginal to break symmetry
bpca.vW.InitialiseTo(randomGaussianArray(bpca.vM.ObservedValue, bpca.vD.ObservedValue));
// Set the initialization
bpca.initW.ObservedValue = randomGaussianArray(bpca.componentCount.ObservedValue, bpca.featureCount.ObservedValue);
// Infer the marginals
bpca.engine.NumberOfIterations = 200;
Gaussian[,] inferredW = bpca.engine.Infer<Gaussian[,]>(bpca.vW);
Gaussian[] inferredMu = bpca.engine.Infer<Gaussian[]>(bpca.vMu);
Gamma[] inferredPi = bpca.engine.Infer<Gamma[]>(bpca.vPi);
var inferredW = bpca.engine.Infer<IArray2D<Gaussian>>(bpca.W);
var inferredMu = bpca.engine.Infer<IReadOnlyList<Gaussian>>(bpca.mu);
var inferredPi = bpca.engine.Infer<IReadOnlyList<Gamma>>(bpca.pi);
// Print out the results
Console.WriteLine("Inferred W:");
@ -148,18 +155,18 @@ namespace Microsoft.ML.Probabilistic.Tutorials
Console.Write(" True bias: ");
printVectorToConsole(trueMu);
Console.Write("Inferred bias: ");
printVectorToConsole(inferredMu);
printVectorToConsole(inferredMu.Select(d => d.GetMean()));
Console.Write(" True noise:");
printVectorToConsole(truePi);
Console.Write("Inferred noise:");
printVectorToConsole(inferredPi);
printVectorToConsole(inferredPi.Select(d => d.GetMean()));
Console.WriteLine();
}
/// <summary>
/// True W. Inference will find a different basis
/// </summary>
static double[,] trueW =
static double[,] trueW =
{
{ -0.30, 0.40, 0.20, -0.15, 0.20, -0.25, -0.50, -0.10, -0.25, 0.10 },
{ -0.10, -0.20, 0.40, 0.50, 0.15, -0.35, 0.05, 0.20, 0.20, -0.15 },
@ -206,7 +213,6 @@ namespace Microsoft.ML.Probabilistic.Tutorials
data[i, j] = Gaussian.Sample(u, truePi[j]);
}
}
return data;
}
@ -216,18 +222,9 @@ namespace Microsoft.ML.Probabilistic.Tutorials
/// <param name="row">Number of rows</param>
/// <param name="col">Number of columns</param>
/// <returns>The array as a distribution over a 2-D double array domain</returns>
private static IDistribution<double[,]> randomGaussianArray(int row, int col)
private static Gaussian[,] randomGaussianArray(int row, int col)
{
Gaussian[,] array = new Gaussian[row, col];
for (int i = 0; i < row; i++)
{
for (int j = 0; j < col; j++)
{
array[i, j] = Gaussian.FromMeanAndVariance(Rand.Normal(), 1);
}
}
return Distribution<double>.Array(array);
return Util.ArrayInit(row, col, (i, j) => Gaussian.FromMeanAndVariance(Rand.Normal(), 1));
}
/// <summary>
@ -235,7 +232,7 @@ namespace Microsoft.ML.Probabilistic.Tutorials
/// </summary>
/// <param name="matrix"></param>
/// <returns></returns>
private static double[] meanAbsoluteRowMeans(Gaussian[,] matrix)
private static double[] meanAbsoluteRowMeans(IArray2D<Gaussian> matrix)
{
double[] mam = new double[matrix.GetLength(0)];
double mult = 1.0 / ((double)matrix.GetLength(1));
@ -246,10 +243,8 @@ namespace Microsoft.ML.Probabilistic.Tutorials
{
sum += System.Math.Abs(matrix[i, j].GetMean());
}
mam[i] = mult * sum;
}
return mam;
}
@ -257,7 +252,7 @@ namespace Microsoft.ML.Probabilistic.Tutorials
/// Print the means of a 2-D array of Gaussians to the console
/// </summary>
/// <param name="matrix"></param>
private static void printMatrixToConsole(Gaussian[,] matrix)
private static void printMatrixToConsole(IArray2D<Gaussian> matrix)
{
for (int i = 0; i < matrix.GetLength(0); i++)
{
@ -265,67 +260,20 @@ namespace Microsoft.ML.Probabilistic.Tutorials
{
Console.Write("{0,5:0.00}\t", matrix[i, j].GetMean());
}
Console.WriteLine("");
}
}
/// <summary>
/// Print a 2-D double array to the console
/// </summary>
/// <param name="matrix"></param>
private static void printMatrixToConsole(double[,] matrix)
{
for (int i = 0; i < matrix.GetLength(0); i++)
{
for (int j = 0; j < matrix.GetLength(1); j++)
{
Console.Write("{0,5:0.00}\t", matrix[i, j]);
}
Console.WriteLine("");
}
}
/// <summary>
/// Print the means of a 1-D array of Gaussians to the console
/// </summary>
/// <param name="vector"></param>
private static void printVectorToConsole(Gaussian[] vector)
{
for (int i = 0; i < vector.GetLength(0); i++)
{
Console.Write("{0,5:0.00}\t", vector[i].GetMean());
}
Console.WriteLine("");
}
/// <summary>
/// Print the means of a 1-D array of Gammas to the console
/// </summary>
/// <param name="vector"></param>
private static void printVectorToConsole(Gamma[] vector)
{
for (int i = 0; i < vector.GetLength(0); i++)
{
Console.Write("{0,5:0.00}\t", vector[i].GetMean());
}
Console.WriteLine("");
}
/// <summary>
/// Print a 1-D double array to the console
/// </summary>
/// <param name="vector"></param>
private static void printVectorToConsole(double[] vector)
private static void printVectorToConsole(IEnumerable<double> vector)
{
for (int i = 0; i < vector.GetLength(0); i++)
foreach (var item in vector)
{
Console.Write("{0,5:0.00}\t", vector[i]);
Console.Write("{0,5:0.00}\t", item);
}
Console.WriteLine("");
}
}

Просмотреть файл

@ -0,0 +1,212 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Microsoft.ML.Probabilistic.Distributions;
using Microsoft.ML.Probabilistic.Models;
namespace Microsoft.ML.Probabilistic.Tutorials
{
/// <summary>
/// Example of learning causal relationships from data using gates in Infer.NET.
///
/// In this example, we consider two Boolean variables, A and B, and attempt to
/// distinguish whether A causes B or vice versa, through the use of data
/// with or without interventions on B.
/// </summary>
[Example("Applications", "Learning causal relationships")]
public class CausalityExample
{
    /// <summary>
    /// Builds two competing causal models (A causes B vs. B causes A) selected by a
    /// Bernoulli gate, then infers the posterior probability of "A causes B" first from
    /// observational data and then from data containing interventions on B.
    /// </summary>
    public void Run()
    {
        // ***************** Experiment configuration ******************
        // Number of data points
        int numberOfDataPoints = 20;

        // Noise parameter - defines the true strength of the association between A and B.
        // This ranges from 0.0 (meaning that A and B are equal) to 0.5
        // (meaning that A and B are uncorrelated).
        double q = 0.1;

        // How we choose to set B in an intervention e.g. 0.5 is by a coin flip.
        // This is a chosen parameter of our randomized study.
        double probBIntervention = 0.5;

        // ********************** Model definition *********************
        // Now we write the Infer.NET model to compare between A causing B and B causing A
        // - in this example we only consider these two possibilities.
        //
        // Gates are used to select between the two possibilities and to represent
        // perfect interventions. In Infer.NET gates are represented as stochastic if
        // statements created using Variable.If() and Variable.IfNot().

        // Uniform prior over our two hypotheses
        // (True = A causes B, False = B causes A)
        var AcausesB = Variable.Bernoulli(0.5);

        // Range across the data
        var N = new Range(numberOfDataPoints);

        // Set up array variables for the data
        var A = Variable.Array<bool>(N).Named("A");
        var B = Variable.Array<bool>(N).Named("B");
        var doB = Variable.Array<bool>(N).Named("doB");

        // Loop over the data points
        using (Variable.ForEach(N))
        {
            // Intervention case - this is the same for either model,
            // so it is defined once here, outside the model-selection gates.
            using (Variable.If(doB[N]))
            {
                // Given intervention B is selected at random
                // using a known parameter e.g. 0.5.
                B[N] = Variable.Bernoulli(probBIntervention);
            }
        }

        // *** First model: that A causes B ***
        using (Variable.If(AcausesB))
        {
            // Loop over the data points
            using (Variable.ForEach(N))
            {
                // Draw A from uniform prior
                A[N] = Variable.Bernoulli(0.5);

                // No intervention case for the A causes B model
                // (the intervention case for B was already defined above)
                using (Variable.IfNot(doB[N]))
                {
                    // Set B to a noisy version of A
                    B[N] = A[N] != (Variable.Bernoulli(q));
                }
            }
        }

        // *** Second model: that B causes A ***
        using (Variable.IfNot(AcausesB))
        {
            // Loop over the data points
            using (Variable.ForEach(N))
            {
                // No intervention case for the B causes A model
                using (Variable.IfNot(doB[N]))
                {
                    // Draw B from uniform prior
                    B[N] = Variable.Bernoulli(0.5);
                }

                // Set A to a noisy version of B
                A[N] = B[N] != (Variable.Bernoulli(q));
            }
        }

        // ************************* Inference *************************
        // Create an Infer.NET inference engine
        var engine = new InferenceEngine();

        Console.WriteLine("Causal inference using gates in Infer.NET");
        Console.WriteLine("=========================================\r\n");
        Console.WriteLine("Data set of " + numberOfDataPoints + " data points with noise " + q + "\r\n");

        // *** Data without interventions ***
        // Generate data set
        var dataWithoutInterventions = GenerateFromTrueModel(numberOfDataPoints, q, false, probBIntervention);

        // Attach the data without interventions
        A.ObservedValue = dataWithoutInterventions.A;
        B.ObservedValue = dataWithoutInterventions.B;
        doB.ObservedValue = dataWithoutInterventions.doB;

        // Infer probability that A causes B (rather than B causes A)
        Bernoulli AcausesBdist = engine.Infer<Bernoulli>(AcausesB);
        Console.WriteLine("P(A causes B), without interventions=" + AcausesBdist.GetProbTrue() + "\r\n");

        // *** Data WITH interventions ***
        // Number of inference runs to average over (each with a different generated data set)
        int numberOfRuns = 10;
        Console.WriteLine("Executing " + numberOfRuns + " runs with interventions:");
        double tot = 0;
        for (int i = 0; i < numberOfRuns; i++)
        {
            // Generate data with interventions
            var dataWithInterventions = GenerateFromTrueModel(numberOfDataPoints, q, true, probBIntervention);

            // Attach the data with interventions (this replaces any previously attached data)
            A.ObservedValue = dataWithInterventions.A;
            B.ObservedValue = dataWithInterventions.B;
            doB.ObservedValue = dataWithInterventions.doB;

            // Infer probability that A causes B (rather than B causes A)
            Bernoulli AcausesBdist2 = engine.Infer<Bernoulli>(AcausesB);
            tot += AcausesBdist2.GetProbTrue();
            Console.WriteLine("{0,4}. P(A causes B)={1}", i + 1, (float)AcausesBdist2.GetProbTrue());
        }

        Console.WriteLine("Average P(A causes B), with interventions=" + (float)(tot / numberOfRuns));
    }

    /// <summary>
    /// Generates data from the true model: A causes B
    /// </summary>
    /// <param name="N">Number of data points to generate</param>
    /// <param name="q">Noise (flip) probability</param>
    /// <param name="doB">Whether to intervene or not (applied uniformly to all data points)</param>
    /// <param name="probBIntervention">Prob of choosing B=true when intervening</param>
    /// <returns>The generated data set</returns>
    private static Data GenerateFromTrueModel(int N, double q, bool doB, double probBIntervention)
    {
        // Create data object to fill with data.
        Data d = new Data { A = new bool[N], B = new bool[N], doB = new bool[N] };

        // Uniform prior on A
        var Aprior = new Bernoulli(0.5);

        // Noise distribution
        var flipDist = new Bernoulli(q);

        // Distribution over the values of B when we intervene
        var interventionDist = new Bernoulli(probBIntervention);

        // Loop over data
        for (int i = 0; i < N; i++)
        {
            // Draw A from prior
            d.A[i] = Aprior.Sample();

            // Whether we intervened on B.
            // This is currently the same for all data points - but could easily be modified.
            d.doB[i] = doB;
            if (!d.doB[i])
            {
                // We are not intervening so use the causal model i.e.
                // make B a noisy version of A - flipping it with probability q
                d.B[i] = d.A[i] != flipDist.Sample();
            }
            else
            {
                // We are intervening - setting B according to a coin flip
                d.B[i] = interventionDist.Sample();
            }
        }

        return d;
    }
}
/// <summary>
/// Holder for one generated data set: per-data-point observations of A and B,
/// plus a flag recording whether B was set by intervention.
/// </summary>
class Data
{
    public bool[] A; // observations of A
    public bool[] B; // observations of B
    public bool[] doB; // whether we intervened to set B at this data point
}
}

Просмотреть файл

@ -73,6 +73,9 @@
<Compile Update="BugsRats.cs">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Compile>
<Compile Update="CausalityExample.cs">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Compile>
<Compile Update="ChessAnalysis.cs">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Compile>

Просмотреть файл

@ -1,3 +0,0 @@
<?xml version="1.0"?>
<configuration>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/></startup></configuration>

Просмотреть файл

@ -1,6 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/>
</startup>
</configuration>

Просмотреть файл

@ -1,10 +0,0 @@
<?xml version="1.0"?>
<configuration>
<startup><supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2"/></startup>
<system.diagnostics>
<switches>
<!-- This line instructs XmlSerializer to leave its generated code in the TEMP directory so that you can debug it -->
<!-- <add name="XmlSerialization.Compilation" value="1" /> -->
</switches>
</system.diagnostics>
</configuration>

Просмотреть файл

@ -84,6 +84,7 @@ namespace Microsoft.ML.Probabilistic.Tests
InferenceEngine engine = new InferenceEngine(new VariationalMessagePassing());
engine.Compiler.DeclarationProvider = RoslynDeclarationProvider.Instance;
var ca = engine.Compiler.Compile(LoopStartError);
ca.Execute(50);
Assert.True(false, "Did not throw exception");
}
catch (CompilationFailedException tfe)
@ -97,8 +98,8 @@ namespace Microsoft.ML.Probabilistic.Tests
private void LoopStartError()
{
double[] array = new double[4]; // invalid number of dims
for (int l = 1; l < 4; l++)
double[] array = new double[4];
for (int l = 1; l < 4; l++) // loop starts at 1 instead of 0
{
array[l] = Factor.Random(Gamma.FromShapeAndScale(1, 1));
}