added DecisionService HTML scrape

This commit is contained in:
Markus Cozowicz 2017-05-26 21:04:02 -04:00
Родитель ae5c30fcac
Коммит d120c132c9
33 изменённых файлов: 2198 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,64 @@
using System.Threading.Tasks;
using Microsoft.Azure.WebJobs.Host;
using System.Net.Http;
using Newtonsoft.Json;
using Microsoft.DecisionService.Crawl.Data;
using Newtonsoft.Json.Linq;
using System.Linq;
using System;
using System.Collections.Generic;
using System.Text;
using System.Net.Http.Headers;
using System.Threading;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// HTTP-triggered function that submits an article's text to the "AzureMLTopic" service
/// and exposes the returned comma-separated topic weights as a numeric array.
/// </summary>
/// <remarks>
/// The actual request dispatch is delegated to <c>HttpCachedService.InvokeAsync</c>, which is
/// defined outside this file; this class only supplies the request-building and
/// response-processing callbacks.
/// </remarks>
public class AzureMLTopic
{
    private static readonly HttpCachedService cachedService;

    static AzureMLTopic()
    {
        cachedService = new HttpCachedService("AzureMLTopic");

        // Attach the service's API key as a Bearer authorization header on the shared client.
        cachedService.client.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("Bearer", cachedService.apiKey);
    }

    /// <summary>
    /// Function entry point (referenced from function.json).
    /// </summary>
    /// <param name="req">Incoming HTTP request; its body is interpreted by <c>InvokeAsync</c>.</param>
    /// <param name="log">Function trace writer.</param>
    /// <param name="cancellationToken">Token forwarded to <c>InvokeAsync</c>.</param>
    /// <returns>The HTTP response produced by <c>InvokeAsync</c>.</returns>
    public static Task<HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log, CancellationToken cancellationToken)
    {
        return cachedService.InvokeAsync(req, log,
            reqBody =>
            {
                // Scoring payload: a single-column ("Text"), single-row table named "input1".
                var scoreRequest = new
                {
                    Inputs = new Dictionary<string, StringTable>(),
                    GlobalParameters = new Dictionary<string, string>() { }
                };

                scoreRequest.Inputs.Add("input1", new StringTable
                {
                    ColumnNames = new string[] { "Text" },
                    Values = new string[,] { { reqBody.Article } }
                });

                return scoreRequest;
            },
            (reqBody, blobContent) =>
            {
                blobContent.Output = new JObject();

                var jobj = JObject.Parse(blobContent.Value);
                var topicRemoteRaw = jobj.SelectToken("$.Results.output1.value.Values[0][0]");
                if (topicRemoteRaw == null)
                    return;

                // The token can be present but hold JSON null or an empty string; treat that as "no topics".
                var rawTopics = topicRemoteRaw.Value<string>();
                if (string.IsNullOrWhiteSpace(rawTopics))
                    return;

                // The weights are machine-generated, so parse them with the invariant culture:
                // a comma-decimal host culture would otherwise read "0.5" incorrectly.
                var topics = rawTopics
                    .Split(',')
                    .Select(t => float.Parse(t, System.Globalization.CultureInfo.InvariantCulture))
                    .ToArray();

                blobContent.Output.Add(new JProperty("topics", topics));
            },
            cancellationToken);
    }

    /// <summary>
    /// Tabular input for the scoring request: column names plus a two-dimensional array of cell values.
    /// </summary>
    public class StringTable
    {
        public string[] ColumnNames { get; set; }

        public string[,] Values { get; set; }
    }
}
}

Просмотреть файл

@ -0,0 +1,18 @@
{
"scriptFile": "..\\bin\\Crawl.dll",
"entryPoint": "Microsoft.DecisionService.Crawl.AzureMLTopic.Run",
"bindings": [
{
"authLevel": "function",
"name": "req",
"type": "httpTrigger",
"direction": "in"
},
{
"name": "$return",
"type": "http",
"direction": "out"
}
],
"disabled": false
}

124
Crawl/BlobCache.cs Normal file
Просмотреть файл

@ -0,0 +1,124 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using Microsoft.WindowsAzure.Storage;
using Microsoft.WindowsAzure.Storage.Blob;
using Newtonsoft.Json;
using System;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// Cache backed by Azure blob storage. Each entry is a JSON-serialized <see cref="CacheItem"/>
/// stored under "&lt;site&gt;/&lt;id&gt;" in a container named "yyyyMM" + service (lower-cased),
/// i.e. one container per service per month.
/// </summary>
public sealed class BlobCache
{
    private readonly CloudBlobClient blobClient;

    /// <summary>
    /// Creates a cache that stores its blobs in the given storage account.
    /// </summary>
    /// <param name="storageConnectionString">Azure storage connection string.</param>
    public BlobCache(string storageConnectionString)
    {
        var account = CloudStorageAccount.Parse(storageConnectionString);
        this.blobClient = account.CreateCloudBlobClient();
    }

    /// <summary>
    /// Returns the container for the month of <paramref name="now"/> and the given service,
    /// creating it if it does not exist yet.
    /// </summary>
    private async Task<CloudBlobContainer> GetContainer(DateTime now, string service, CancellationToken cancellationToken = default(CancellationToken))
    {
        // Format with the invariant culture so the container name does not depend on the
        // host's calendar (a non-Gregorian current culture would yield a different year).
        var monthPrefix = now.ToString("yyyyMM", System.Globalization.CultureInfo.InvariantCulture);

        var container = this.blobClient.GetContainerReference((monthPrefix + service).ToLowerInvariant());
        await container.CreateIfNotExistsAsync(cancellationToken);

        return container;
    }

    /// <summary>
    /// Builds the blob name "&lt;site&gt;/&lt;id&gt;" from a site and an id (typically a URL).
    /// </summary>
    private static string ToBlobName(string site, string id)
    {
        // escape for blob name
        id = id.Replace("//", "__")
            .Replace(":", "_");

        // Blob names are limited to 1024 characters; over-long ids are truncated.
        // https://docs.microsoft.com/en-us/rest/api/storageservices/fileservices/naming-and-referencing-containers--blobs--and-metadata
        var maxIdLength = 1024 - (site.Length + 1);
        if (id.Length > maxIdLength)
            id = id.Substring(0, maxIdLength);

        // <site>/<url>
        var sb = new StringBuilder();
        sb.Append(site);
        if (!id.StartsWith("/", StringComparison.Ordinal))
            sb.Append('/');
        sb.Append(id);

        return sb.ToString();
    }

    /// <summary>
    /// Looks up a cache entry in the current month's container and, if not found there,
    /// in the previous month's container.
    /// </summary>
    /// <param name="site">Site the entry belongs to (first segment of the blob name).</param>
    /// <param name="id">Entry id (second segment of the blob name).</param>
    /// <param name="service">Service name, used as part of the container name.</param>
    /// <param name="input">Input recorded in the entry when it has to be (re)created.</param>
    /// <param name="refreshTimeSpan">How far in the future the next refresh is scheduled when the entry is (re)created.</param>
    /// <param name="cancellationToken">Token passed to the storage operations.</param>
    /// <returns>
    /// The cached content when an entry exists whose refresh time lies in the future; otherwise
    /// <c>null</c>, after an entry with the new input and refresh time has been written to the
    /// current month's blob.
    /// </returns>
    public async Task<BlobContent> GetAsync(string site, string id, string service, string input, TimeSpan refreshTimeSpan, CancellationToken cancellationToken)
    {
        var now = DateTime.UtcNow;

        // The blob name is the same in every monthly container, so compute it once.
        var blobName = ToBlobName(site, id);

        CacheItem cacheItem = null;
        CloudBlockBlob currentBlob = null;
        for (int i = 0; i < 2 && cacheItem == null; i++)
        {
            var container = await this.GetContainer(now.AddMonths(-i), service, cancellationToken);
            var blob = container.GetBlockBlobReference(blobName);

            // The first iteration targets the current month; remember that blob for writes.
            if (currentBlob == null)
                currentBlob = blob;

            if (!await blob.ExistsAsync(cancellationToken))
                continue;

            var json = await blob.DownloadTextAsync(cancellationToken);
            cacheItem = JsonConvert.DeserializeObject<CacheItem>(json);

            // An empty (or literal "null") blob deserializes to null; treat it as a miss
            // instead of dereferencing it.
            if (cacheItem == null)
                continue;

            // replicate in current month
            if (i > 0)
                await currentBlob.UploadTextAsync(json, cancellationToken);

            // if it isn't up for refresh, just return the existing
            if (cacheItem.NextRefreshTimestamp > DateTime.UtcNow)
                return new BlobContent
                {
                    Value = cacheItem.Output,
                    Expires = cacheItem.NextRefreshTimestamp
                };
        }

        if (cacheItem == null)
            cacheItem = new CacheItem();

        // Record the new input and refresh time; any previous output stays in the entry.
        cacheItem.Input = input;
        cacheItem.NextRefreshTimestamp = DateTime.UtcNow + refreshTimeSpan;

        await currentBlob.UploadTextAsync(JsonConvert.SerializeObject(cacheItem), cancellationToken);

        return null;
    }

    /// <summary>
    /// Writes an entry with the given input and output to the current month's container.
    /// </summary>
    /// <param name="site">Site the entry belongs to (first segment of the blob name).</param>
    /// <param name="id">Entry id (second segment of the blob name).</param>
    /// <param name="service">Service name, used as part of the container name.</param>
    /// <param name="input">Input that produced <paramref name="output"/>.</param>
    /// <param name="output">Output to cache.</param>
    /// <param name="refreshTimeSpan">How long the entry is considered fresh.</param>
    /// <param name="cancellationToken">Token passed to the storage operations.</param>
    /// <returns>The stored output together with its refresh time.</returns>
    public async Task<BlobContent> PersistAsync(string site, string id, string service, string input, string output, TimeSpan refreshTimeSpan, CancellationToken cancellationToken)
    {
        var container = await this.GetContainer(DateTime.UtcNow, service, cancellationToken);
        var blobName = ToBlobName(site, id);
        var blob = container.GetBlockBlobReference(blobName);

        var cacheItem = new CacheItem
        {
            NextRefreshTimestamp = DateTime.UtcNow + refreshTimeSpan,
            // store the input as well, to be consistent with GetAsync
            Input = input,
            Output = output
        };

        await blob.UploadTextAsync(JsonConvert.SerializeObject(cacheItem), cancellationToken);

        return new BlobContent
        {
            Value = output,
            Expires = cacheItem.NextRefreshTimestamp
        };
    }
}
}

20
Crawl/BlobContent.cs Normal file
Просмотреть файл

@ -0,0 +1,20 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using Newtonsoft.Json.Linq;
using System;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// Result of a cache lookup or store: the raw cached value, its refresh time, and an
/// optional structured output that is filled in by the individual service functions.
/// </summary>
public sealed class BlobContent
{
/// <summary>Raw cached output; BlobCache fills it from the cache item's output.</summary>
public string Value { get; set; }
/// <summary>Time at which the entry is due for refresh (BlobCache sets it to the item's next refresh timestamp).</summary>
public DateTime Expires { get; set; }
/// <summary>Structured output; BlobCache leaves it unset and the service response handlers assign it.</summary>
public JObject Output { get; set; }
}
}

25
Crawl/CacheItem.cs Normal file
Просмотреть файл

@ -0,0 +1,25 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using Newtonsoft.Json;
using System;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// JSON document persisted by BlobCache for each cached entry.
/// </summary>
public sealed class CacheItem
{
/// <summary>Point in time after which the entry is up for refresh; BlobCache returns the entry as-is while this is still in the future.</summary>
[JsonProperty("nextRefreshTimestamp")]
public DateTime NextRefreshTimestamp { get; set; }
/// <summary>Input that was recorded for the entry.</summary>
// NOTE(review): RawStringConverter is defined elsewhere; presumably it embeds the string as raw JSON - confirm.
[JsonProperty("input")]
[JsonConverter(typeof(RawStringConverter))]
public string Input { get; set; }
/// <summary>Cached output for the entry (serialized through the same converter as the input).</summary>
[JsonProperty("output")]
[JsonConverter(typeof(RawStringConverter))]
public string Output { get; set; }
}
}

32
Crawl/CertificateUtil.cs Normal file
Просмотреть файл

@ -0,0 +1,32 @@
using System;
using System.Linq;
using System.Security.Cryptography.X509Certificates;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// Helpers for locating X.509 certificates in the local certificate stores.
/// </summary>
public static class CertificateUtil
{
    /// <summary>
    /// Finds a certificate by thumbprint in the "My" (personal) store of the given location.
    /// </summary>
    /// <param name="storeLocation">Store location to search (current user or local machine).</param>
    /// <param name="thumbprint">Thumbprint of the certificate to look up.</param>
    /// <returns>The first certificate matching the thumbprint.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="thumbprint"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="thumbprint"/> is empty or whitespace.</exception>
    /// <exception cref="InvalidOperationException">
    /// No certificate with the thumbprint exists in the store; the message lists the thumbprints that are available.
    /// </exception>
    public static X509Certificate2 FindCertificateByThumbprint(StoreLocation storeLocation, string thumbprint)
    {
        // Fail fast on unusable input before opening the store.
        if (thumbprint == null)
            throw new ArgumentNullException(nameof(thumbprint));
        if (string.IsNullOrWhiteSpace(thumbprint))
            throw new ArgumentException("A certificate thumbprint must be provided.", nameof(thumbprint));

        X509Store store = new X509Store(StoreName.My, storeLocation);
        try
        {
            store.Open(OpenFlags.ReadOnly);

            X509Certificate2Collection col = store.Certificates.Find(
                X509FindType.FindByThumbprint,
                thumbprint,
                validOnly: false); // Don't validate certs as they're self-signed

            if (col == null || col.Count == 0)
            {
                var availableCertThumbprints = string.Join(",", store.Certificates.OfType<X509Certificate2>().Select(c => c.Thumbprint));

                // A specific exception type instead of the bare System.Exception;
                // existing catch (Exception) handlers keep working.
                throw new InvalidOperationException($"Cannot find certificate in My\\{storeLocation} with thumbprint '{thumbprint}'. Available certs are {availableCertThumbprints}");
            }

            return col[0];
        }
        finally
        {
            store.Close();
        }
    }
}
}

33
Crawl/CognitiveService.cs Normal file
Просмотреть файл

@ -0,0 +1,33 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using System;
using System.Threading.Tasks;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// <c>HttpCachedService</c> specialization for Cognitive Services endpoints: sends the API key
/// in the "Ocp-Apim-Subscription-Key" header and optionally appends a fixed query string to
/// the client's base address.
/// </summary>
public class CognitiveService : HttpCachedService
{
    private readonly string queryParams;

    /// <summary>
    /// Creates the service wrapper.
    /// </summary>
    /// <param name="containerName">Name passed through to the base class.</param>
    /// <param name="queryParams">Optional query string appended to the base address during initialization.</param>
    public CognitiveService(string containerName, string queryParams = null)
        : base(containerName)
    {
        this.queryParams = queryParams;
    }

    /// <summary>
    /// Configures the HTTP client: adds the subscription-key header and, when query
    /// parameters were supplied, extends the base address with them.
    /// </summary>
    protected override void Initialize()
    {
        // TODO: need to re-create client (can't just update base address if the key changes...)
        // Removing a previously added "Ocp-Apim-Subscription-Key" header first is intentionally
        // not done here yet.
        this.client.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", this.apiKey);

        if (string.IsNullOrEmpty(this.queryParams))
        {
            return;
        }

        var addressWithQuery = this.client.BaseAddress.ToString() + this.queryParams;
        this.client.BaseAddress = new Uri(addressWithQuery);
    }
}
}

Просмотреть файл

@ -0,0 +1,73 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using System.Threading.Tasks;
using Microsoft.Azure.WebJobs.Host;
using System.Net.Http;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System.Linq;
using System.Text;
using System.Threading;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// HTTP-triggered function that sends an article's title and text to the "CogEntityLinking"
/// service and exposes the linked entities as a "Tags" object of name to highest score.
/// </summary>
public class CognitiveServiceEntityLinking
{
    private static readonly CognitiveService cogService;

    static CognitiveServiceEntityLinking()
    {
        cogService = new CognitiveService("CogEntityLinking");
    }

    /// <summary>
    /// Function entry point (referenced from function.json).
    /// </summary>
    /// <param name="req">Incoming HTTP request.</param>
    /// <param name="log">Function trace writer.</param>
    /// <param name="cancellationToken">Token forwarded to the service call.</param>
    /// <returns>The HTTP response produced by the service wrapper.</returns>
    public static Task<HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log, CancellationToken cancellationToken)
    {
        return cogService.InvokeAsync(req, log,
            reqBody =>
            {
                // Request text: title and article on separate lines, skipping whichever is missing.
                var text = new StringBuilder();
                foreach (var part in new[] { reqBody.Title, reqBody.Article })
                {
                    if (!string.IsNullOrEmpty(part))
                    {
                        text.AppendLine(part);
                    }
                }

                return Services.Limit(text.ToString(), 10240);
            },
            (reqBody, blobContent) =>
            {
                blobContent.Output = new JObject();

                var parsed = JsonConvert.DeserializeObject<EntityResponse>(blobContent.Value);
                if (parsed == null || parsed.Entities == null)
                {
                    return;
                }

                // One property per distinct entity name, carrying the highest score seen for it.
                var bestScorePerEntity =
                    from entity in parsed.Entities
                    group entity.Score by entity.Name into scores
                    select new JProperty(scores.Key, scores.Max());

                blobContent.Output.Add("Tags", new JObject(bestScorePerEntity));
            },
            cancellationToken);
    }

    /// <summary>Service response: the list of linked entities.</summary>
    public class EntityResponse
    {
        [JsonProperty("entities")]
        public Entity[] Entities { get; set; }
    }

    /// <summary>A single linked entity with its score.</summary>
    public class Entity
    {
        [JsonProperty("name")]
        public string Name { get; set; }

        [JsonProperty("score")]
        public float Score { get; set; }
    }
}
}

Просмотреть файл

@ -0,0 +1,18 @@
{
"scriptFile": "..\\bin\\Crawl.dll",
"entryPoint": "Microsoft.DecisionService.Crawl.CognitiveServiceEntityLinking.Run",
"bindings": [
{
"authLevel": "function",
"name": "req",
"type": "httpTrigger",
"direction": "in"
},
{
"name": "$return",
"type": "http",
"direction": "out"
}
],
"disabled": false
}

Просмотреть файл

@ -0,0 +1,115 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using System.Threading.Tasks;
using Microsoft.Azure.WebJobs.Host;
using System.Net.Http;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;
using System.Text;
using System.Threading;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// HTTP-triggered function that sends an article's title and text to the "CogTextAnalytics"
/// service and exposes the returned score as "XSentiment".
/// </summary>
public class CognitiveServiceTextAnalytics
{
    // Maximum number of characters submitted per document.
    // Based on email thread with Arvind Krishnaa Jagannathan <arjagann@microsoft.com>
    private const int MaxTextLength = 10240 / 2;

    private static readonly CognitiveService cogService;

    static CognitiveServiceTextAnalytics()
    {
        cogService = new CognitiveService("CogTextAnalytics");
    }

    /// <summary>
    /// Function entry point (referenced from function.json).
    /// </summary>
    /// <param name="req">Incoming HTTP request.</param>
    /// <param name="log">Function trace writer.</param>
    /// <param name="cancellationToken">Token forwarded to the service call.</param>
    /// <returns>The HTTP response produced by the service wrapper.</returns>
    public static Task<HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log, CancellationToken cancellationToken)
    {
        return cogService.InvokeAsync(req, log,
            reqBody =>
            {
                // Title and article on separate lines, skipping whichever is missing.
                var combined = new StringBuilder();
                if (!string.IsNullOrEmpty(reqBody.Title))
                {
                    combined.AppendLine(reqBody.Title);
                }

                if (!string.IsNullOrEmpty(reqBody.Article))
                {
                    combined.AppendLine(reqBody.Article);
                }

                var text = combined.ToString();
                if (text.Length >= MaxTextLength)
                {
                    text = text.Substring(0, MaxTextLength);
                }

                // The language is deliberately left unset (it used to be "english").
                var document = new TextAnalyticDocument
                {
                    Text = text,
                    Id = "1"
                };

                return new TextAnalyticRequest
                {
                    Documents = new List<TextAnalyticDocument> { document }
                };
            },
            (reqBody, blobContent) =>
            {
                blobContent.Output = new JObject();

                var response = JsonConvert.DeserializeObject<TextAnalyticResponse>(blobContent.Value);

                // Exactly one document was submitted, so exactly one is expected back.
                var documents = response == null ? null : response.Documents;
                if (documents == null || documents.Length != 1)
                {
                    return;
                }

                blobContent.Output.Add(new JProperty("XSentiment", documents[0].Score));
            },
            cancellationToken);
    }

    /// <summary>Request payload: the documents to analyze.</summary>
    public class TextAnalyticRequest
    {
        [JsonProperty("documents")]
        public List<TextAnalyticDocument> Documents { get; set; }
    }

    /// <summary>A single document to analyze; the language is omitted from the JSON when null.</summary>
    public class TextAnalyticDocument
    {
        [JsonProperty("language", NullValueHandling = NullValueHandling.Ignore)]
        public string Language { get; set; }

        [JsonProperty("id")]
        public string Id { get; set; }

        [JsonProperty("text")]
        public string Text { get; set; }
    }

    /// <summary>Response payload: per-document results and per-document errors.</summary>
    public class TextAnalyticResponse
    {
        [JsonProperty("documents")]
        public TextAnalyticResponseDocument[] Documents { get; set; }

        [JsonProperty("errors")]
        public TextAnalyticResponseError[] Errors { get; set; }
    }

    /// <summary>Score returned for one document.</summary>
    public class TextAnalyticResponseDocument
    {
        [JsonProperty("id")]
        public string Id { get; set; }

        [JsonProperty("score")]
        public float Score { get; set; }
    }

    /// <summary>Error returned for one document.</summary>
    public class TextAnalyticResponseError
    {
        [JsonProperty("id")]
        public string Id { get; set; }

        [JsonProperty("message")]
        public string Message { get; set; }
    }
}
}

Просмотреть файл

@ -0,0 +1,18 @@
{
"scriptFile": "..\\bin\\Crawl.dll",
"entryPoint": "Microsoft.DecisionService.Crawl.CognitiveServiceTextAnalytics.Run",
"bindings": [
{
"authLevel": "function",
"name": "req",
"type": "httpTrigger",
"direction": "in"
},
{
"name": "$return",
"type": "http",
"direction": "out"
}
],
"disabled": false
}

Просмотреть файл

@ -0,0 +1,151 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using System.Threading.Tasks;
using Microsoft.Azure.WebJobs.Host;
using System.Net.Http;
using Newtonsoft.Json;
using Crawl.Data;
using Crawl;
using Newtonsoft.Json.Linq;
using System.Linq;
using System.Threading;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// HTTP-triggered function that submits an image URL to the "CogVision" service and exposes
/// tags, adult-content scores and recognized celebrities in separate output namespaces.
/// </summary>
public class CognitiveServiceVision
{
    private static readonly CognitiveService cogService;

    static CognitiveServiceVision()
    {
        cogService = new CognitiveService("CogVision", queryParams: "?visualFeatures=Categories,Tags,Adult,Faces&details=Celebrities&language=en");
    }

    /// <summary>
    /// Function entry point (referenced from function.json).
    /// </summary>
    /// <param name="req">Incoming HTTP request.</param>
    /// <param name="log">Function trace writer.</param>
    /// <param name="cancellationToken">Token forwarded to the service call.</param>
    /// <returns>The HTTP response produced by the service wrapper.</returns>
    public static async Task<HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log, CancellationToken cancellationToken)
    {
        return await cogService.InvokeAsync(req, log,
            reqBody => new UrlHolder { Url = reqBody.Image },
            (reqBody, blobContent) =>
            {
                var visionResponse = JsonConvert.DeserializeObject<VisionResponse>(blobContent.Value);

                // multiple namespaces
                blobContent.Output = new JObject();

                // An empty or literal "null" payload deserializes to null. Leave the output
                // empty in that case, like the other Cognitive Services functions do.
                if (visionResponse == null)
                    return;

                // R,S,T,U
                if (visionResponse.Tags != null)
                {
                    // Group by name (keeping the highest confidence) because a JObject
                    // rejects duplicate property names; celebrities are handled the same way.
                    var tags = visionResponse.Tags
                        .GroupBy(t => t.Name)
                        .Select(g => new JProperty(g.Key, g.Max(t => t.Confidence)));

                    blobContent.Output.Add(
                        new JProperty("RVisionTags", new JObject(tags)));
                }

                if (visionResponse.Adult != null)
                    blobContent.Output.Add(
                        new JProperty("SVisionAdult",
                            JObject.Parse(JsonConvert.SerializeObject(visionResponse.Adult))));

                if (visionResponse.Categories != null)
                {
                    // The categories themselves are not emitted for now; only the
                    // celebrities nested in their details are.
                    var celebs =
                        visionResponse.Categories
                            .Where(c => c.Detail != null && c.Detail.Celebrities != null)
                            .SelectMany(c => c.Detail.Celebrities)
                            .GroupBy(c => c.Name)
                            .ToList();

                    if (celebs.Count > 0)
                        blobContent.Output.Add(
                            new JProperty("TVisionCelebrities",
                                new JObject(
                                    celebs.Select(t => new JProperty(t.Key, t.Max(x => x.Confidence))))));
                }
            },
            cancellationToken);
    }

    /// <summary>Service response with the requested visual features.</summary>
    public class VisionResponse
    {
        [JsonProperty("categories")]
        public Category[] Categories { get; set; }

        [JsonProperty("adult")]
        public Adult Adult { get; set; }

        [JsonProperty("tags")]
        public Tag[] Tags { get; set; }

        [JsonProperty("faces")]
        public Face[] Faces { get; set; }
    }

    /// <summary>Image category with its score and optional detail.</summary>
    public class Category
    {
        [JsonProperty("name")]
        public string Name { get; set; }

        [JsonProperty("score")]
        public float Score { get; set; }

        [JsonProperty("detail")]
        public CategoryDetail Detail { get; set; }
    }

    /// <summary>Category detail: the celebrities recognized in the image.</summary>
    public class CategoryDetail
    {
        [JsonProperty("celebrities")]
        public Celebrity[] Celebrities { get; set; }
    }

    /// <summary>Recognized celebrity with the recognition confidence.</summary>
    public class Celebrity
    {
        [JsonProperty("name")]
        public string Name { get; set; }

        [JsonProperty("confidence")]
        public float Confidence { get; set; }
    }

    /// <summary>Adult/racy content classification and scores.</summary>
    public class Adult
    {
        [JsonProperty("isAdultContent")]
        public bool IsAdultContent { get; set; }

        [JsonProperty("isRacyContent")]
        public bool IsRacyContent { get; set; }

        [JsonProperty("adultScore")]
        public float AdultScore { get; set; }

        [JsonProperty("racyScore")]
        public float RacyScore { get; set; }
    }

    /// <summary>Image tag with its confidence.</summary>
    public class Tag
    {
        [JsonProperty("name")]
        public string Name { get; set; }

        [JsonProperty("confidence")]
        public float Confidence { get; set; }
    }

    /// <summary>Detected face attributes.</summary>
    public class Face
    {
        [JsonProperty("age")]
        public int Age { get; set; }

        [JsonProperty("gender")]
        public string Gender { get; set; }
    }
}
}

Просмотреть файл

@ -0,0 +1,18 @@
{
"scriptFile": "..\\bin\\Crawl.dll",
"entryPoint": "Microsoft.DecisionService.Crawl.CognitiveServiceVision.Run",
"bindings": [
{
"authLevel": "function",
"name": "req",
"type": "httpTrigger",
"direction": "in"
},
{
"name": "$return",
"type": "http",
"direction": "out"
}
],
"disabled": false
}

224
Crawl/Crawl.csproj Normal file
Просмотреть файл

@ -0,0 +1,224 @@
<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="..\packages\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.1.0.4\build\net45\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.props" Condition="Exists('..\packages\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.1.0.4\build\net45\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.props')" />
<Import Project="..\packages\Microsoft.Net.Compilers.2.1.0\build\Microsoft.Net.Compilers.props" Condition="Exists('..\packages\Microsoft.Net.Compilers.2.1.0\build\Microsoft.Net.Compilers.props')" />
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>
</ProductVersion>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{28285B58-63A5-48F9-99DA-7498E0E3AB22}</ProjectGuid>
<ProjectTypeGuids>{349c5851-65df-11da-9384-00065b846f21};{fae04ec0-301f-11d3-bf4b-00c04f79efbc}</ProjectTypeGuids>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>Crawl</RootNamespace>
<AssemblyName>Crawl</AssemblyName>
<TargetFrameworkVersion>v4.6.2</TargetFrameworkVersion>
<UseIISExpress>true</UseIISExpress>
<IISExpressSSLPort />
<IISExpressAnonymousAuthentication />
<IISExpressWindowsAuthentication />
<IISExpressUseClassicPipelineMode />
<UseGlobalApplicationHostFile />
<NuGetPackageImportStamp>
</NuGetPackageImportStamp>
<TargetFrameworkProfile />
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<PlatformTarget>AnyCPU</PlatformTarget>
</PropertyGroup>
<!-- Release configuration: optimized, pdb-only symbols, compiled for x64 (Debug uses AnyCPU).
     Like Debug it writes to bin\, which appears to be the folder the function.json
     scriptFile entries ("..\\bin\\Crawl.dll") point at. -->
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<ExcludeGeneratedDebugSymbol>false</ExcludeGeneratedDebugSymbol>
<PlatformTarget>x64</PlatformTarget>
</PropertyGroup>
<ItemGroup>
<Reference Include="HtmlAgilityPack, Version=1.4.9.5, Culture=neutral, PublicKeyToken=bd319b19eaf3b43a, processorArchitecture=MSIL">
<HintPath>..\packages\HtmlAgilityPack.1.4.9.5\lib\Net45\HtmlAgilityPack.dll</HintPath>
</Reference>
<Reference Include="Microsoft.ApplicationInsights, Version=2.3.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.ApplicationInsights.2.3.0\lib\net46\Microsoft.ApplicationInsights.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Azure.KeyVault, Version=2.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.Azure.KeyVault.2.0.6\lib\net45\Microsoft.Azure.KeyVault.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Azure.KeyVault.Core, Version=2.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.Azure.KeyVault.Core.2.0.4\lib\net45\Microsoft.Azure.KeyVault.Core.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Azure.KeyVault.WebKey, Version=2.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.Azure.KeyVault.WebKey.2.0.5\lib\net452\Microsoft.Azure.KeyVault.WebKey.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Azure.WebJobs, Version=2.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.Azure.WebJobs.Core.2.0.0\lib\net45\Microsoft.Azure.WebJobs.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Azure.WebJobs.Host, Version=2.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.Azure.WebJobs.2.0.0\lib\net45\Microsoft.Azure.WebJobs.Host.dll</HintPath>
</Reference>
<Reference Include="Microsoft.CodeDom.Providers.DotNetCompilerPlatform, Version=1.0.4.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.1.0.4\lib\net45\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.dll</HintPath>
</Reference>
<Reference Include="Microsoft.CSharp" />
<Reference Include="Microsoft.Data.Edm, Version=5.8.1.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.Data.Edm.5.8.2\lib\net40\Microsoft.Data.Edm.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Data.OData, Version=5.8.1.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.Data.OData.5.8.2\lib\net40\Microsoft.Data.OData.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Data.Services.Client, Version=5.8.1.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.Data.Services.Client.5.8.2\lib\net40\Microsoft.Data.Services.Client.dll</HintPath>
</Reference>
<Reference Include="Microsoft.IdentityModel.Clients.ActiveDirectory, Version=3.13.9.1126, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.IdentityModel.Clients.ActiveDirectory.3.13.9\lib\net45\Microsoft.IdentityModel.Clients.ActiveDirectory.dll</HintPath>
</Reference>
<Reference Include="Microsoft.IdentityModel.Clients.ActiveDirectory.Platform, Version=3.13.9.1126, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.IdentityModel.Clients.ActiveDirectory.3.13.9\lib\net45\Microsoft.IdentityModel.Clients.ActiveDirectory.Platform.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Rest.ClientRuntime, Version=2.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.Rest.ClientRuntime.2.3.7\lib\net452\Microsoft.Rest.ClientRuntime.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Rest.ClientRuntime.Azure, Version=3.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\Microsoft.Rest.ClientRuntime.Azure.3.3.6\lib\net452\Microsoft.Rest.ClientRuntime.Azure.dll</HintPath>
</Reference>
<Reference Include="Microsoft.WindowsAzure.Storage, Version=8.1.1.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\WindowsAzure.Storage.8.1.1\lib\net45\Microsoft.WindowsAzure.Storage.dll</HintPath>
</Reference>
<Reference Include="Newtonsoft.Json, Version=9.0.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
<HintPath>..\packages\Newtonsoft.Json.9.0.1\lib\net45\Newtonsoft.Json.dll</HintPath>
</Reference>
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="System.Net" />
<Reference Include="System.Net.Http">
</Reference>
<Reference Include="System.Net.Http.WebRequest" />
<Reference Include="System.Runtime.Serialization" />
<Reference Include="System.Security.Cryptography.Algorithms, Version=4.1.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL">
<HintPath>..\packages\System.Security.Cryptography.Algorithms.4.3.0\lib\net461\System.Security.Cryptography.Algorithms.dll</HintPath>
</Reference>
<Reference Include="System.Security.Cryptography.Encoding, Version=4.0.1.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL">
<HintPath>..\packages\System.Security.Cryptography.Encoding.4.3.0\lib\net46\System.Security.Cryptography.Encoding.dll</HintPath>
</Reference>
<Reference Include="System.Security.Cryptography.Primitives, Version=4.0.1.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL">
<HintPath>..\packages\System.Security.Cryptography.Primitives.4.3.0\lib\net46\System.Security.Cryptography.Primitives.dll</HintPath>
</Reference>
<Reference Include="System.Security.Cryptography.X509Certificates, Version=4.1.1.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL">
<HintPath>..\packages\System.Security.Cryptography.X509Certificates.4.3.0\lib\net461\System.Security.Cryptography.X509Certificates.dll</HintPath>
</Reference>
<Reference Include="System.Spatial, Version=5.8.1.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL">
<HintPath>..\packages\System.Spatial.5.8.2\lib\net40\System.Spatial.dll</HintPath>
</Reference>
<Reference Include="System.Web.DynamicData" />
<Reference Include="System.Web.Entity" />
<Reference Include="System.Web.ApplicationServices" />
<Reference Include="System.ComponentModel.DataAnnotations" />
<Reference Include="System" />
<Reference Include="System.Data" />
<Reference Include="System.Drawing" />
<Reference Include="System.Web" />
<Reference Include="System.Web.Extensions" />
<Reference Include="System.Xml" />
<Reference Include="System.Configuration" />
<Reference Include="System.Web.Services" />
<Reference Include="System.EnterpriseServices" />
<Reference Include="System.Xml.Linq" />
</ItemGroup>
<ItemGroup>
<Content Include="packages.config">
<SubType>Designer</SubType>
</Content>
<Content Include="Crawl\function.json" />
<Content Include="host.json" />
<Content Include="RSS\function.json" />
<Compile Include="AzureMLTopic\AzureMLTopic.cs" />
<Compile Include="BlobCache.cs" />
<Compile Include="BlobContent.cs" />
<Compile Include="CacheItem.cs" />
<Compile Include="CognitiveServiceEmotion\CognitiveServiceEmotion.cs" />
<Compile Include="CognitiveServiceEntityLinking\CognitiveServiceEntityLinking.cs" />
<Compile Include="CognitiveServiceTextAnalytics\CognitiveServiceTextAnalytics.cs" />
<Compile Include="HttpCachedService.cs" />
<Compile Include="RSS\RSS.cs" />
<Content Include="CognitiveServiceVision\function.json" />
<Content Include="CognitiveServiceEmotion\function.json" />
<Content Include="CognitiveServiceEntityLinking\function.json" />
<Content Include="CognitiveServiceTextAnalytics\function.json" />
<Content Include="AzureMLTopic\function.json" />
<None Include="Properties\PublishProfiles\DevProfile.pubxml" />
<None Include="Web.Debug.config">
<DependentUpon>Web.config</DependentUpon>
</None>
<None Include="Web.Release.config">
<DependentUpon>Web.config</DependentUpon>
</None>
</ItemGroup>
<ItemGroup>
<Content Include="Web.config">
<SubType>Designer</SubType>
</Content>
</ItemGroup>
<ItemGroup>
<Compile Include="CognitiveService.cs" />
<Compile Include="CognitiveServiceVision\CognitiveServiceVision.cs" />
<Compile Include="Crawl\HtmlExtractor.cs" />
<Compile Include="Crawl\Crawl.cs" />
<Compile Include="Data\CrawlRequest.cs" />
<Compile Include="Data\CrawlResponse.cs" />
<Compile Include="Data\UrlHolder.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Services.cs" />
<Compile Include="KeyVaultHelper.cs" />
<Compile Include="CertificateUtil.cs" />
</ItemGroup>
<PropertyGroup>
<VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">10.0</VisualStudioVersion>
<VSToolsPath Condition="'$(VSToolsPath)' == ''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath>
</PropertyGroup>
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
<Import Project="$(VSToolsPath)\WebApplications\Microsoft.WebApplication.targets" Condition="'$(VSToolsPath)' != ''" />
<Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v10.0\WebApplications\Microsoft.WebApplication.targets" Condition="false" />
<ProjectExtensions>
<VisualStudio>
<FlavorProperties GUID="{349c5851-65df-11da-9384-00065b846f21}">
<WebProjectProperties>
<UseIIS>True</UseIIS>
<AutoAssignPort>True</AutoAssignPort>
<DevelopmentServerPort>33183</DevelopmentServerPort>
<DevelopmentServerVPath>/</DevelopmentServerVPath>
<IISUrl>http://localhost:33183/</IISUrl>
<NTLMAuthentication>False</NTLMAuthentication>
<UseCustomServer>False</UseCustomServer>
<CustomServerUrl>
</CustomServerUrl>
<SaveServerSettingsInUserFile>False</SaveServerSettingsInUserFile>
</WebProjectProperties>
</FlavorProperties>
</VisualStudio>
</ProjectExtensions>
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\Microsoft.Net.Compilers.2.1.0\build\Microsoft.Net.Compilers.props')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Net.Compilers.2.1.0\build\Microsoft.Net.Compilers.props'))" />
<Error Condition="!Exists('..\packages\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.1.0.4\build\net45\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.props')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.1.0.4\build\net45\Microsoft.CodeDom.Providers.DotNetCompilerPlatform.props'))" />
</Target>
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

112
Crawl/Crawl/Crawl.cs Normal file
Просмотреть файл

@ -0,0 +1,112 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using System.Globalization;
using System.Threading.Tasks;
using Microsoft.Azure.WebJobs.Host;
using System.IO;
using System.Net;
using System.Net.Http;
using Crawl.Crawl;
using Newtonsoft.Json;
using System;
using Microsoft.DecisionService.Crawl.Data;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.ApplicationInsights.DataContracts;
using Microsoft.ApplicationInsights;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// HTTP-triggered function that downloads the page referenced by a <see cref="CrawlRequest"/>
/// and returns the metadata extracted by <see cref="HtmlExtractor"/> as JSON.
/// </summary>
public class Crawl
{
    // <meta property="microsoft:ds_id" content="some-id">
    //private static Regex MetaMicrosoftDsIdRegex = new Regex(@"<meta[^>]+property\s*=\s*[""']microsoft:ds_id[""'][^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    //private static Regex MetaContentRegex = new Regex(@"content\s*=\s*[""']([^""']+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Reads a JSON <see cref="CrawlRequest"/> from the request body, fetches the URL it names
    /// and responds with the serialized <see cref="CrawlResponse"/>.
    /// </summary>
    /// <param name="req">Incoming request whose body is the JSON crawl request.</param>
    /// <param name="log">Function trace writer.</param>
    /// <returns>A 200 response carrying the extracted metadata as ASCII-escaped JSON.</returns>
    /// <remarks>
    /// Any failure is reported to telemetry and then rethrown, so the caller sees the original exception.
    /// </remarks>
    public static async Task<HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log)
    {
        // Kept outside the try block so the catch handler can report whatever was read so far.
        CrawlRequest crawlRequest = null;
        string reqBodyStr = null;

        try
        {
            using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl.HTML"))
            {
                reqBodyStr = await req.Content.ReadAsStringAsync();
                crawlRequest = JsonConvert.DeserializeObject<CrawlRequest>(reqBodyStr);

                operation.Telemetry.Properties.Add("AppId", crawlRequest.Site);
                operation.Telemetry.Properties.Add("ActionId", crawlRequest.Id);
                operation.Telemetry.Properties.Add("Url", crawlRequest.Url);
                log.Info($"Crawl AppId={crawlRequest.Site} Id={crawlRequest.Id} Url={crawlRequest.Url}");

                var request = (HttpWebRequest)WebRequest.Create(crawlRequest.Url);

                // Let the origin know which version the caller already has.
                if (!string.IsNullOrEmpty(crawlRequest.ETag))
                {
                    request.Headers.Add(HttpRequestHeader.IfNoneMatch, crawlRequest.ETag);
                }

                request.Method = "GET";
                request.KeepAlive = true;
                request.UserAgent = "DSbot/1.0 (+https://ds.microsoft.com/bot.htm)";

                using (var response = (HttpWebResponse)await request.GetResponseAsync())
                {
                    operation.Telemetry.ResultCode = response.StatusCode.ToString();

                    using (var stream = response.GetResponseStream())
                    using (var reader = new StreamReader(stream))
                    {
                        // TODO: allow direct JSON
                        // TODO: look for schema.org
                        var html = await reader.ReadToEndAsync();

                        // TODO: support microsoft:ds_id
                        var result = HtmlExtractor.Parse(html, new Uri(crawlRequest.Url));
                        result.Url = crawlRequest.Url;
                        result.Site = crawlRequest.Site;
                        result.Id = crawlRequest.Id;

                        var json = JsonConvert.SerializeObject(result, new JsonSerializerSettings
                        {
                            Formatting = Formatting.None,
                            StringEscapeHandling = StringEscapeHandling.EscapeNonAscii
                        });

                        return new HttpResponseMessage(HttpStatusCode.OK)
                        {
                            Content = new StringContent(
                                json,
                                new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
                                "application/json")
                        };
                    }
                }
            }
        }
        catch (Exception ex)
        {
            var props = new Dictionary<string, string>
            {
                { "Service", req.RequestUri?.ToString() }
            };

            if (crawlRequest == null)
            {
                // The body could not be parsed; report it verbatim.
                props.Add("JSON", reqBodyStr);
            }
            else
            {
                props.Add("Url", crawlRequest.Url);
                props.Add("AppId", crawlRequest.Site);
                props.Add("ActionId", crawlRequest.Id);
            }

            Services.TelemetryClient.TrackException(ex, props);

            // Rethrow without resetting the stack trace.
            throw;
        }
    }
}
}

Просмотреть файл

@ -0,0 +1,203 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using Microsoft.DecisionService.Crawl.Data;
using HtmlAgilityPack;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// Extracts Open Graph / Twitter card metadata and the main article text from an HTML page.
/// See https://moz.com/blog/meta-data-templates-123 for the meta tags involved.
/// </summary>
public static class HtmlExtractor
{
    private static readonly HashSet<string> TitleProperties;

    private static readonly HashSet<string> DescriptionProperties;

    // Elements whose content is never part of the visible text.
    private static readonly HashSet<string> skipTags = new HashSet<string>()
    {
        "script", "style"
    };

    static HtmlExtractor()
    {
        TitleProperties = new HashSet<string>
            { "og:title", "twitter:title" };

        DescriptionProperties = new HashSet<string>
            { "og:description", "twitter:description" };
    }

    /// <summary>
    /// Returns the "content" attribute of the first node whose "property" attribute value is
    /// contained in <paramref name="properties"/>, or null when there is no such node.
    /// </summary>
    private static string FirstOrNull(HtmlNodeCollection collection, HashSet<string> properties)
    {
        if (collection == null)
            return null;

        var node = collection.FirstOrDefault(n =>
        {
            var property = n.Attributes["property"];
            return property != null && properties.Contains(property.Value);
        });

        return node?.Attributes["content"]?.Value;
    }

    /// <summary>
    /// Returns the "content" (preferred) or "value" attribute of the first node matched by
    /// <paramref name="xpath"/> that carries one of them; null when nothing matches.
    /// </summary>
    private static string FindMeta(HtmlNode headNode, string xpath)
    {
        var nodes = headNode.SelectNodes(xpath);
        if (nodes == null)
            return null;

        foreach (var node in nodes)
        {
            var attr = node.Attributes["content"] ?? node.Attributes["value"];
            if (attr != null)
                return attr.Value;
        }

        return null;
    }

    /// <summary>
    /// Returns the tag-stripped text of the first node matched by <paramref name="xpath"/>
    /// that contains any text; null when nothing matches.
    /// </summary>
    private static string FindValue(HtmlNode headNode, string xpath)
    {
        var nodes = headNode.SelectNodes(xpath);
        if (nodes == null)
            return null;

        foreach (var node in nodes)
        {
            var text = new StringBuilder();
            StripTags(node, text);

            if (text.Length > 0)
                return text.ToString();
        }

        return null;
    }

    /// <summary>
    /// Lazily yields the "content" and "value" attribute values of every node matched by
    /// <paramref name="xpath"/> (both are yielded when a node carries both).
    /// </summary>
    private static IEnumerable<string> FindAll(HtmlNode headNode, string xpath)
    {
        var nodes = headNode.SelectNodes(xpath);
        if (nodes == null)
            yield break;

        foreach (var node in nodes)
        {
            var attr = node.Attributes["content"];
            if (attr != null)
                yield return attr.Value;

            attr = node.Attributes["value"];
            if (attr != null)
                yield return attr.Value;
        }
    }

    /// <summary>
    /// Appends the trimmed text of every leaf below <paramref name="root"/> to
    /// <paramref name="plaintext"/>, each followed by a space; script/style elements and
    /// comments are skipped.
    /// </summary>
    private static void StripTags(HtmlNode root, StringBuilder plaintext)
    {
        foreach (var node in root.ChildNodes)
        {
            if (skipTags.Contains(node.Name.ToLowerInvariant()) || node.NodeType == HtmlNodeType.Comment)
                continue;

            if (node.HasChildNodes)
            {
                StripTags(node, plaintext);
                continue;
            }

            string text = node.InnerText;
            if (!string.IsNullOrWhiteSpace(text))
                plaintext.Append(text.Trim()).Append(' ');
        }
    }

    /// <summary>
    /// Returns the text content below <paramref name="root"/> with all markup removed.
    /// </summary>
    public static string StripTags(HtmlNode root)
    {
        var plaintext = new StringBuilder();
        StripTags(root, plaintext);

        return plaintext.ToString();
    }

    /// <summary>
    /// Parses <paramref name="html"/> and fills a <see cref="CrawlResponse"/> with title,
    /// description, type, categories, image, article text and the pass-through id.
    /// </summary>
    /// <param name="html">The page markup.</param>
    /// <param name="sourceUrl">URL the page was fetched from; its scheme completes protocol-relative image URLs.</param>
    /// <returns>The extracted data; an empty response when the document has no html/head element.</returns>
    public static CrawlResponse Parse(string html, Uri sourceUrl)
    {
        var response = new CrawlResponse();

        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        var head = doc.DocumentNode.SelectSingleNode("html/head");
        if (head == null)
            return response;

        // Note: attribute tests need the '@' prefix; a bare 'name' would test a child element.
        response.Title = FindMeta(head, "meta[@property='og:title' or @name='og:title' or @property='twitter:title' or @name='twitter:title']");

        // Fall back to the <title> element.
        if (string.IsNullOrEmpty(response.Title))
            response.Title = FindValue(head, "title");

        if (!string.IsNullOrEmpty(response.Title))
            response.Title = WebUtility.HtmlDecode(response.Title.Trim());

        // A missing description is left unset; it must not overwrite the title resolved above.
        response.Description = FindMeta(head, "meta[@property='og:description' or @name='og:description' or @property='twitter:description' or @name='twitter:description' or @name='description']");

        if (response.Description != null)
            response.Description = WebUtility.HtmlDecode(response.Description.Trim());

        response.Type = FindMeta(head, "meta[@property='og:type' or @name='og:type']");

        var categories = FindAll(head, "meta[@property='article:tag' or @name='article:tag']").ToList();
        if (categories.Count > 0)
            response.Categories = categories;

        // TODO: get the better resolution
        var img = FindMeta(head, "meta[@property='og:image' or @name='og:image' or @property='twitter:image' or @name='twitter:image']");
        if (img != null)
        {
            // Protocol-relative URL: reuse the scheme of the page itself.
            if (img.StartsWith("//", StringComparison.Ordinal))
                img = sourceUrl.Scheme + ":" + img;

            // TODO: support relative URLs too
            response.Image = img;
        }

        // build article
        var articleText = new StringBuilder();

        var articles = doc.DocumentNode.SelectNodes("//article");
        if (articles != null)
        {
            // find the longest article text
            string longest = null;
            foreach (var art in articles)
            {
                var candidate = StripTags(art);
                if (longest == null || longest.Length < candidate.Length)
                    longest = candidate;
            }

            if (!string.IsNullOrEmpty(longest))
                articleText.AppendLine(longest);
        }

        response.Article = WebUtility.HtmlDecode(articleText.ToString());

        // <meta property="microsoft:ds_id" content="255308" data-react-helmet="true">
        var dsId = FindMeta(head, "meta[@property='microsoft:ds_id' or @name='microsoft:ds_id']");
        response.PassThroughDetails = WebUtility.HtmlDecode(dsId);

        return response;
    }
}
}

18
Crawl/Crawl/function.json Normal file
Просмотреть файл

@ -0,0 +1,18 @@
{
"scriptFile": "..\\bin\\Crawl.dll",
"entryPoint": "Microsoft.DecisionService.Crawl.Crawl.Run",
"bindings": [
{
"authLevel": "function",
"name": "req",
"type": "httpTrigger",
"direction": "in"
},
{
"name": "$return",
"type": "http",
"direction": "out"
}
],
"disabled": false
}

Просмотреть файл

@ -0,0 +1,29 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
namespace Microsoft.DecisionService.Crawl.Data
{
/// <summary>
/// JSON payload accepted by the crawl function.
/// </summary>
public class CrawlRequest
{
/// <summary>Site identifier; serialized as "site".</summary>
[JsonProperty("site")]
public string Site { get; set; }
/// <summary>Item identifier; serialized as "id".</summary>
[JsonProperty("id")]
public string Id { get; set; }
/// <summary>URL of the page to crawl; serialized as "url".</summary>
[JsonProperty("url")]
public string Url { get; set; }
/// <summary>Optional entity tag; serialized as "etag".</summary>
[JsonProperty("etag")]
public string ETag { get; set; }
}
}

Просмотреть файл

@ -0,0 +1,50 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
namespace Microsoft.DecisionService.Crawl.Data
{
/// <summary>
/// JSON payload describing a crawled page. All optional members are omitted from the
/// serialized output when null.
/// </summary>
public class CrawlResponse
{
/// <summary>Site identifier; serialized as "site".</summary>
[JsonProperty("site")]
public string Site { get; set; }
/// <summary>Item identifier; serialized as "id".</summary>
[JsonProperty("id")]
public string Id { get; set; }
/// <summary>URL of the page; serialized as "url".</summary>
[JsonProperty("url", NullValueHandling = NullValueHandling.Ignore)]
public string Url { get; set; }
/// <summary>Page title; serialized as "title".</summary>
[JsonProperty("title", NullValueHandling = NullValueHandling.Ignore)]
public string Title { get; set; }
/// <summary>Page description; serialized as "description".</summary>
[JsonProperty("description", NullValueHandling = NullValueHandling.Ignore)]
public string Description { get; set; }
/// <summary>Page type; serialized as "type".</summary>
[JsonProperty("type", NullValueHandling = NullValueHandling.Ignore)]
public string Type { get; set; }
/// <summary>Page categories; serialized as "categories".</summary>
[JsonProperty("categories", NullValueHandling = NullValueHandling.Ignore)]
public List<string> Categories { get; set; }
/// <summary>Image URL; serialized as "image".</summary>
[JsonProperty("image", NullValueHandling = NullValueHandling.Ignore)]
public string Image { get; set; }
/// <summary>Article text; serialized as "article".</summary>
[JsonProperty("article", NullValueHandling = NullValueHandling.Ignore)]
public string Article { get; set; }
/// <summary>Pass-through details; serialized as "ds_id".</summary>
[JsonProperty("ds_id", NullValueHandling = NullValueHandling.Ignore)]
public string PassThroughDetails { get; set; }
/// <summary>Force-refresh flag, false by default; serialized as "forceRefresh".</summary>
[JsonProperty("forceRefresh")]
public bool ForceRefresh { get; set; } = false;
}
}

20
Crawl/Data/UrlHolder.cs Normal file
Просмотреть файл

@ -0,0 +1,20 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
namespace Microsoft.DecisionService.Crawl.Data
{
/// <summary>
/// JSON payload carrying a single URL.
/// </summary>
public class UrlHolder
{
/// <summary>The URL; serialized as "url".</summary>
[JsonProperty("url")]
public string Url { get; set; }
}
}

235
Crawl/HttpCachedService.cs Normal file
Просмотреть файл

@ -0,0 +1,235 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using Crawl.Data;
using Microsoft.ApplicationInsights;
using Microsoft.ApplicationInsights.DataContracts;
using Microsoft.Azure.KeyVault;
using Microsoft.Azure.WebJobs.Host;
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Diagnostics;
using System.Globalization;
using System.Net.Http;
using System.Security.Cryptography.X509Certificates;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Web.Configuration;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// Calls a remote HTTP endpoint and caches its responses through <see cref="BlobCache"/>.
/// Endpoint, API key and storage connection string are read from Key Vault on first use.
/// </summary>
public class HttpCachedService
{
    // Limit due to Azure Storage container names: 24 characters minus a "yyyyMM" suffix.
    private const int MaxContainerNameLength = 24 - 6;

    internal readonly string containerName;
    internal HttpClient client;
    internal string endpoint;
    internal string apiKey;
    internal string storageConnectionString;

    /// <summary>
    /// Creates the service.
    /// </summary>
    /// <param name="containerName">
    /// Name used for the cache container, for telemetry and as prefix of the
    /// "&lt;name&gt;Endpoint" / "&lt;name&gt;Key" Key Vault secrets.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="containerName"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="containerName"/> is longer than 18 characters.</exception>
    public HttpCachedService(string containerName)
    {
        if (containerName == null)
            throw new ArgumentNullException(nameof(containerName));

        if (containerName.Length > MaxContainerNameLength)
            throw new ArgumentException($"{nameof(containerName)}: '{containerName}' is too long. Must be {MaxContainerNameLength} characters at most.");

        this.containerName = containerName;
    }

    /// <summary>
    /// Hook invoked once after <see cref="client"/> has been created; does nothing by default.
    /// </summary>
    protected virtual void Initialize()
    { }

    /// <summary>
    /// Loads the secrets from Key Vault and creates the HTTP client; a no-op once the client exists.
    /// </summary>
    private async Task InitializeAsync()
    {
        if (this.client != null)
            return;

        var keyVaultUrl = ConfigurationManager.AppSettings["KeyVaultUrl"];

        var keyVaultHelper = new KeyVaultHelper(
            StoreLocation.CurrentUser,
            ConfigurationManager.AppSettings["AzureActiveDirectoryClientId"],
            ConfigurationManager.AppSettings["AzureActiveDirectoryCertificateThumbprint"]);

        var keyVault = new KeyVaultClient(new KeyVaultClient.AuthenticationCallback(keyVaultHelper.GetAccessToken));

        this.endpoint = (await keyVault.GetSecretAsync(keyVaultUrl, containerName + "Endpoint").ConfigureAwait(false)).Value;
        this.apiKey = (await keyVault.GetSecretAsync(keyVaultUrl, containerName + "Key").ConfigureAwait(false)).Value;
        this.storageConnectionString = (await keyVault.GetSecretAsync(keyVaultUrl, "StorageConnectionString").ConfigureAwait(false)).Value;

        this.client = new HttpClient()
        {
            BaseAddress = new Uri(this.endpoint)
        };

        this.Initialize();
    }

    /// <summary>
    /// Returns the cached response for <paramref name="request"/>, or posts the request to the
    /// endpoint and persists the answer when there is no cache entry.
    /// </summary>
    /// <param name="log">Function trace writer.</param>
    /// <param name="site">Site identifier, part of the cache key.</param>
    /// <param name="id">Item identifier, part of the cache key.</param>
    /// <param name="request">A raw string (sent as text/plain) or an object (sent as JSON).</param>
    /// <param name="forceRefresh">When true the cache lookup is skipped.</param>
    /// <param name="cancellationToken">Cancels cache access and the outbound HTTP call.</param>
    /// <returns>
    /// The cached or freshly persisted content. After an unsuccessful HTTP status the content
    /// has no value and expires one minute from now.
    /// </returns>
    public async Task<BlobContent> PostAsync(TraceWriter log, string site, string id, object request, bool forceRefresh, CancellationToken cancellationToken)
    {
        await this.InitializeAsync();

        var stopwatch = Stopwatch.StartNew();
        var cacheHit = true;
        HttpResponseMessage responseMessage = null;
        string body = null;

        try
        {
            body = request as string;
            string input;
            string contentType;

            if (body != null)
            {
                // if this is a raw string, we need to escape for storage
                input = JsonConvert.SerializeObject(request);
                contentType = "text/plain";
            }
            else
            {
                body = JsonConvert.SerializeObject(request);
                input = body;
                contentType = "application/json";
            }

            log.Trace(new TraceEvent(TraceLevel.Verbose,
                $"Requesting {this.containerName} at {this.endpoint}: {body}"));

            var blobCache = new BlobCache(this.storageConnectionString);

            // lookup Azure Blob storage cache first
            // have a 5min timeout for retries
            BlobContent blobContent = null;
            if (!forceRefresh)
                blobContent = await blobCache.GetAsync(site, id, this.containerName, input, TimeSpan.FromMinutes(5), cancellationToken);

            if (blobContent == null)
            {
                cacheHit = false;

                // Capture the start before the call so the dependency telemetry is placed correctly in time.
                var requestStart = DateTime.UtcNow;
                var requestStopwatch = Stopwatch.StartNew();

                // make the actual HTTP request
                responseMessage = await this.client.PostAsync(
                    string.Empty,
                    new StringContent(
                        body,
                        new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
                        contentType),
                    cancellationToken);

                Services.TelemetryClient.TrackDependency(this.containerName, this.endpoint, this.containerName, null,
                    requestStart, requestStopwatch.Elapsed,
                    $"{responseMessage.StatusCode} {responseMessage.ReasonPhrase}", responseMessage.IsSuccessStatusCode);

                log.Trace(new TraceEvent(TraceLevel.Verbose, $"Response: {responseMessage.StatusCode} {responseMessage.ReasonPhrase}"));

                if (!responseMessage.IsSuccessStatusCode)
                {
                    // Not persisted: only tells the caller when a retry is worthwhile.
                    blobContent = new BlobContent
                    {
                        // TODO: random expiration
                        Expires = DateTime.UtcNow + TimeSpan.FromMinutes(1),
                    };
                }
                else
                {
                    var responseStr = await responseMessage.Content.ReadAsStringAsync();

                    log.Trace(new TraceEvent(TraceLevel.Verbose, $"Result {this.containerName} at {this.endpoint}: {responseStr}"));

                    // once we got a response, cache for 3 days
                    // TODO: add configuration option
                    // TODO: add force refresh parameter
                    blobContent = await blobCache.PersistAsync(site, id, this.containerName, input, responseStr, TimeSpan.FromDays(3), cancellationToken);
                }
            }

            return blobContent;
        }
        finally
        {
            var props = new Dictionary<string, string>
            {
                { "site", site },
                { "id", id },
                { "cacheHit", cacheHit.ToString() },
                { "StatusCode", responseMessage?.StatusCode.ToString() },
                { "Reason", responseMessage?.ReasonPhrase }
            };

            var eventName = new StringBuilder(this.containerName);
            if (responseMessage != null && responseMessage.StatusCode != System.Net.HttpStatusCode.OK)
            {
                props.Add("Request", body);
                eventName.Append(" failed");
            }

            Services.TelemetryClient.TrackEvent(
                eventName.ToString(),
                props,
                metrics: new Dictionary<string, double>
                {
                    { "requestTime", stopwatch.ElapsedMilliseconds }
                });

            // The body has been read (or was not needed) by now; release the connection resources.
            responseMessage?.Dispose();
        }
    }

    /// <summary>
    /// Handles one function invocation: deserializes the <see cref="CrawlResponse"/> in the
    /// request body, builds the remote request from it, obtains the (cached) result and lets
    /// <paramref name="responseAction"/> turn it into the output.
    /// </summary>
    /// <param name="req">Incoming request whose body is a JSON <see cref="CrawlResponse"/>.</param>
    /// <param name="log">Function trace writer.</param>
    /// <param name="requestBodyFunc">Builds the body sent to the remote endpoint.</param>
    /// <param name="responseAction">Invoked when a value is available, to populate the output.</param>
    /// <param name="cancellationToken">Cancels the underlying operations.</param>
    /// <returns>The HTTP response created from the resulting <see cref="BlobContent"/>.</returns>
    /// <remarks>Failures are reported via <see cref="Services.TrackException"/> and rethrown.</remarks>
    public async Task<HttpResponseMessage> InvokeAsync(HttpRequestMessage req, TraceWriter log,
        Func<CrawlResponse, object> requestBodyFunc,
        Action<CrawlResponse, BlobContent> responseAction,
        CancellationToken cancellationToken)
    {
        log.Info("Crawl." + this.containerName);

        await this.InitializeAsync();

        // Kept outside the try block so the catch handler can report whatever was obtained so far.
        string reqBodyStr = null;
        CrawlResponse reqBody = null;
        BlobContent blobContent = null;

        try
        {
            using (var operation = Services.TelemetryClient.StartOperation<DependencyTelemetry>("Crawl." + this.containerName))
            {
                reqBodyStr = await req.Content.ReadAsStringAsync();
                reqBody = JsonConvert.DeserializeObject<CrawlResponse>(reqBodyStr);

                operation.Telemetry.Target = this.endpoint;
                operation.Telemetry.Properties.Add("AppId", reqBody.Site);
                operation.Telemetry.Properties.Add("ActionId", reqBody.Id);

                blobContent = await this.PostAsync(
                    log,
                    reqBody.Site,
                    reqBody.Id,
                    requestBodyFunc(reqBody),
                    reqBody.ForceRefresh,
                    cancellationToken);

                if (blobContent != null)
                {
                    operation.Telemetry.Properties.Add("Expires", blobContent.Expires.ToString(CultureInfo.InvariantCulture));

                    if (blobContent.Value != null)
                    {
                        responseAction(reqBody, blobContent);

                        operation.Telemetry.ResultCode = "OK";
                    }
                }

                return req.CreateResponse(blobContent);
            }
        }
        catch (Exception ex)
        {
            Services.TrackException(ex, req, log, reqBodyStr, reqBody, blobContent);

            // Rethrow without resetting the stack trace.
            throw;
        }
    }
}
}

27
Crawl/KeyVaultHelper.cs Normal file
Просмотреть файл

@ -0,0 +1,27 @@
using Microsoft.IdentityModel.Clients.ActiveDirectory;
using System;
using System.Security.Cryptography.X509Certificates;
using System.Threading.Tasks;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// Acquires Azure Active Directory tokens for Key Vault using a client certificate.
/// See https://azure.microsoft.com/en-us/documentation/articles/key-vault-use-from-web-application/.
/// </summary>
public class KeyVaultHelper
{
    private readonly ClientAssertionCertificate assertionCert;

    /// <summary>
    /// Looks up the certificate with the given thumbprint and pairs it with the client id.
    /// </summary>
    /// <param name="storeLocation">Certificate store location to search.</param>
    /// <param name="clientId">Azure Active Directory client (application) id.</param>
    /// <param name="thumbprint">Thumbprint of the certificate to authenticate with.</param>
    public KeyVaultHelper(StoreLocation storeLocation, string clientId, string thumbprint)
    {
        this.assertionCert = new ClientAssertionCertificate(
            clientId,
            CertificateUtil.FindCertificateByThumbprint(storeLocation, thumbprint));
    }

    /// <summary>
    /// Authentication callback: returns an access token for <paramref name="resource"/>.
    /// </summary>
    /// <param name="authority">Token authority to authenticate against.</param>
    /// <param name="resource">Resource the token is requested for.</param>
    /// <param name="scope">Not used.</param>
    /// <returns>The access token.</returns>
    public async Task<string> GetAccessToken(string authority, string resource, string scope)
    {
        var authContext = new AuthenticationContext(authority, TokenCache.DefaultShared);
        var authResult = await authContext.AcquireTokenAsync(resource, this.assertionCert);

        return authResult.AccessToken;
    }
}
}

Просмотреть файл

@ -0,0 +1,35 @@
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyTitle("Crawl")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("Crawl")]
[assembly: AssemblyCopyright("Copyright © 2017")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]
// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]
// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("28285b58-63a5-48f9-99da-7498e0e3ab22")]
// Version information for an assembly consists of the following four values:
//
// Major Version
// Minor Version
// Build Number
// Revision
//
// You can specify all the values or you can default the Revision and Build Numbers
// by using the '*' as shown below:
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]

147
Crawl/RSS/RSS.cs Normal file
Просмотреть файл

@ -0,0 +1,147 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using System.Globalization;
using System.Threading.Tasks;
using Microsoft.Azure.WebJobs.Host;
using System.Net.Http;
using System;
using Crawl.Data;
using Crawl;
using System.Collections.Generic;
using System.Xml.Linq;
using System.Linq;
using System.Diagnostics;
using System.Text;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// HTTP-triggered function that downloads an RSS feed and converts its most recent items
/// into a JSON list of actions.
/// </summary>
public sealed class RSS
{
    // Maximum number of feed items returned.
    private const int MaxItems = 15;

    // Accepted pubDate layouts: numeric offset, 'Z' suffix and the RFC 822 "GMT" zone.
    private static readonly string[] PubDateFormats =
    {
        "ddd, dd MMM yyyy HH:mm:ss zzz",
        "ddd, dd MMM yyyy HH:mm:ss Z",
        "ddd, dd MMM yyyy HH:mm:ss 'GMT'"
    };

    private static readonly HttpClient client = new HttpClient();

    /// <summary>
    /// Request payload: the URL of the feed.
    /// </summary>
    public class URLHolder
    {
        [JsonProperty("url")]
        public string Url { get; set; }
    }

    /// <summary>
    /// Fetches the feed named in the request body and returns its newest items, in feed order,
    /// as a JSON array of { ids, features, details } objects.
    /// </summary>
    /// <param name="req">Incoming request whose body is a JSON <see cref="URLHolder"/>.</param>
    /// <param name="log">Function trace writer.</param>
    /// <returns>
    /// Always a 200 response. The body is empty when the feed could not be fetched or parsed;
    /// the failure is logged and sent to telemetry instead of being surfaced to the caller.
    /// </returns>
    public static async Task<HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log)
    {
        var url = string.Empty;
        var stopwatch = Stopwatch.StartNew();
        var jsonResponse = string.Empty;

        try
        {
            var reqBodyStr = await req.Content.ReadAsStringAsync();
            var reqBody = JsonConvert.DeserializeObject<URLHolder>(reqBodyStr);
            url = reqBody.Url;

            log.Info("RSS " + url);

            // TODO: use HttpCachedService (also as means of failover if the RSS stream is down)
            string data = await client.GetStringAsync(url);

            var rss = XDocument.Parse(data);

            var items = rss.DescendantNodes()
                .OfType<XElement>()
                .Where(a => a.Name == "item")
                .Select((elem, index) =>
                {
                    var pubDateStr = elem.Descendants("pubDate").FirstOrDefault()?.Value?.Trim();

                    // Normalise to UTC so parsed dates are comparable with the UtcNow fallback below.
                    if (!DateTime.TryParseExact(pubDateStr, PubDateFormats, CultureInfo.InvariantCulture, DateTimeStyles.AdjustToUniversal, out DateTime pubDate))
                        pubDate = DateTime.UtcNow;

                    return new { elem, pubDate, index };
                })
                .OrderByDescending(elem => elem.pubDate)
                // limit the feed to avoid getting too many
                .Take(MaxItems)
                // Note: this is very important for the Dashboard
                // The order of the items allows customers to specify their base-line policy
                .OrderBy(elem => elem.index)
                .Select(x => x.elem);

            var actions = items.Select(x => new
            {
                ids = new[] { new { id = x.Descendants("link").FirstOrDefault()?.Value } },
                features = new
                {
                    _title = x.Descendants("title").FirstOrDefault()?.Value
                },
                details = new []
                {
                    // TODO: properly support 4.2.6. The "atom:id" Element
                    new { guid = x.Descendants("guid").FirstOrDefault()?.Value }
                }
            }).ToList();

            jsonResponse = JsonConvert.SerializeObject(actions);

            if (log.Level == TraceLevel.Verbose)
                log.Trace(new TraceEvent(TraceLevel.Verbose, $"Successfully transformed '{url}' '{data}' to '{jsonResponse}'"));
            else
                log.Info($"Successfully transformed '{url}'");
        }
        catch (HttpRequestException hre)
        {
            var msg = $"RSS Featurization failed '{url}' for '{req.RequestUri.ToString()}': '{hre.Message}'";
            log.Warning(msg);

            // TODO: maybe switch to dependency w/ status failed?
            Services.TelemetryClient.TrackEvent(msg,
                new Dictionary<string, string>
                {
                    { "Service", req.RequestUri.ToString() },
                    { "Url", url },
                    { "Exception", hre.Message}
                });
        }
        catch (Exception ex)
        {
            log.Error($"Failed to process '{url}'", ex);

            Services.TelemetryClient.TrackException(
                ex,
                new Dictionary<string, string>
                {
                    { "Service", req.RequestUri.ToString() },
                    { "Url", url }
                });

            // Deliberately swallowed: an empty body is returned so that problems with the
            // remote feed can be told apart from real outages of this service.
        }
        finally
        {
            Services.TelemetryClient.TrackEvent($"RSS {url}",
                metrics: new Dictionary<string, double>
                {
                    { "requestTime", stopwatch.ElapsedMilliseconds }
                });
        }

        return new HttpResponseMessage(System.Net.HttpStatusCode.OK)
        {
            Content = new StringContent(
                jsonResponse,
                new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
                "application/json")
        };
    }
}
}

18
Crawl/RSS/function.json Normal file
Просмотреть файл

@ -0,0 +1,18 @@
{
"scriptFile": "..\\bin\\Crawl.dll",
"entryPoint": "Microsoft.DecisionService.Crawl.RSS.Run",
"bindings": [
{
"authLevel": "function",
"name": "req",
"type": "httpTrigger",
"direction": "in"
},
{
"name": "$return",
"type": "http",
"direction": "out"
}
],
"disabled": false
}

121
Crawl/RawStringConverter.cs Normal file
Просмотреть файл

@ -0,0 +1,121 @@
using Newtonsoft.Json;
using System;
using System.Globalization;
using System.Text;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// Custom JSON converter returning the underlying raw json (avoiding object allocation).
/// Reading yields the JSON text of the current value as a string; writing emits the string
/// verbatim as raw JSON.
/// </summary>
public class RawStringConverter : JsonConverter
{
    /// <summary>
    /// Accepts every type; the converter is expected to be attached explicitly to the members it applies to.
    /// </summary>
    public override bool CanConvert(Type objectType)
    {
        return true;
    }

    /// <summary>
    /// True when <paramref name="token"/> ends a complete value, i.e. a following sibling
    /// (value or property name) must be separated from it by a comma.
    /// </summary>
    private static bool CompletesValue(JsonToken token)
    {
        switch (token)
        {
            case JsonToken.Boolean:
            case JsonToken.Date:
            case JsonToken.String:
            case JsonToken.Float:
            case JsonToken.Integer:
            case JsonToken.Raw:
            case JsonToken.Null:
            case JsonToken.Bytes:
            case JsonToken.EndObject:
            case JsonToken.EndArray:
                return true;
            default:
                return false;
        }
    }

    /// <summary>
    /// Re-serializes the value the reader is positioned on (a primitive, or a whole
    /// object/array including nested content) into compact JSON text.
    /// </summary>
    /// <returns>The JSON text as a <see cref="string"/>.</returns>
    public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer)
    {
        var sb = new StringBuilder();

        JsonToken previousToken = JsonToken.None;
        int depth = 0;

        do
        {
            // The reader already sits on the first token of the value; only advance afterwards.
            if (sb.Length > 0 && !reader.Read())
                break;

            var token = reader.TokenType;

            // Comments carry no data and would produce invalid JSON if copied.
            if (token == JsonToken.Comment)
                continue;

            // Separate siblings: anything following a completed value, unless the container closes.
            if (CompletesValue(previousToken) && token != JsonToken.EndArray && token != JsonToken.EndObject)
                sb.Append(',');

            switch (token)
            {
                case JsonToken.PropertyName:
                    // Serialize to get full JSON escaping (quotes, backslashes, control characters).
                    sb.Append(JsonConvert.SerializeObject(reader.Value.ToString())).Append(':');
                    break;
                case JsonToken.Boolean:
                    sb.Append((bool)reader.Value ? "true" : "false");
                    break;
                case JsonToken.Integer:
                case JsonToken.Float:
                    sb.AppendFormat(CultureInfo.InvariantCulture, "{0}", reader.Value);
                    break;
                case JsonToken.Bytes:
                case JsonToken.Date:
                    sb.Append(JsonConvert.SerializeObject(reader.Value));
                    break;
                case JsonToken.Null:
                    sb.Append("null");
                    break;
                case JsonToken.String:
                    sb.Append(JsonConvert.SerializeObject((string)reader.Value));
                    break;
                case JsonToken.Raw:
                    sb.Append(reader.Value);
                    break;
                case JsonToken.StartArray:
                    sb.Append('[');
                    depth++;
                    break;
                case JsonToken.EndArray:
                    sb.Append(']');
                    depth--;
                    break;
                case JsonToken.StartObject:
                    sb.Append('{');
                    depth++;
                    break;
                case JsonToken.EndObject:
                    sb.Append('}');
                    depth--;
                    break;
            }

            previousToken = token;
        }
        while (depth > 0);

        return sb.ToString();
    }

    /// <summary>
    /// Writes <paramref name="value"/> (a string holding JSON text) without re-encoding it.
    /// </summary>
    public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer)
    {
        writer.WriteRawValue((string)value);
    }
}
}

89
Crawl/Services.cs Normal file
Просмотреть файл

@ -0,0 +1,89 @@
//------------------------------------------------------------------------------
// <copyright company="Microsoft Corporation">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
using Crawl.Data;
using Microsoft.ApplicationInsights;
using Microsoft.ApplicationInsights.Extensibility;
using Microsoft.Azure.WebJobs.Host;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Web;
namespace Microsoft.DecisionService.Crawl
{
/// <summary>
/// Shared telemetry client and helper methods used by the crawl functions.
/// </summary>
public static class Services
{
    public readonly static TelemetryClient TelemetryClient;

    static Services()
    {
        TelemetryConfiguration.Active.InstrumentationKey = System.Configuration.ConfigurationManager.AppSettings["AppInsightsKey"];
        TelemetryClient = new TelemetryClient();
        TelemetryClient.Context.Cloud.RoleName = "Crawl";
        TelemetryClient.Context.Component.Version = typeof(Services).Assembly.GetName().Version.ToString();
    }

    /// <summary>
    /// Truncates <paramref name="text"/> so that its UTF-8 encoding takes at most
    /// <paramref name="numBytes"/> bytes.
    /// </summary>
    /// <param name="text">Text to limit; null or empty is returned unchanged.</param>
    /// <param name="numBytes">Maximum size of the UTF-8 encoded result in bytes.</param>
    /// <returns>The longest prefix of <paramref name="text"/> that fits, never ending inside a surrogate pair.</returns>
    public static string Limit(string text, int numBytes)
    {
        if (string.IsNullOrEmpty(text))
            return text;

        if (numBytes <= 0)
            return string.Empty;

        if (Encoding.UTF8.GetByteCount(text) <= numBytes)
            return text;

        var chars = text.ToCharArray();

        // Every char needs at least one byte, so more than numBytes chars can never fit.
        var length = Math.Min(text.Length, numBytes);
        while (length > 0 && Encoding.UTF8.GetByteCount(chars, 0, length) > numBytes)
            length--;

        // Do not cut a surrogate pair in half.
        if (length > 0 && char.IsHighSurrogate(chars[length - 1]))
            length--;

        return text.Substring(0, length);
    }

    /// <summary>
    /// Builds the 200 JSON response for <paramref name="blobContent"/>: its output object
    /// extended by an "_expires" property, or an empty body when there is no output.
    /// </summary>
    public static HttpResponseMessage CreateResponse(this HttpRequestMessage req, BlobContent blobContent)
    {
        blobContent.Output?.Add(new JProperty("_expires", blobContent.Expires));

        var response = new HttpResponseMessage(System.Net.HttpStatusCode.OK)
        {
            Content = new StringContent(
                blobContent.Output?.ToString(Formatting.None) ?? string.Empty,
                new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
                "application/json")
        };

        // Get replaced in deployed version
        // response.Content.Headers.Expires = expires;
        // response.Content.Headers.TryAddWithoutValidation("X-DecisionService-Expires", expires.ToString("ddd, dd MMM yyyy HH:mm:ss 'GMT'"));

        return response;
    }

    /// <summary>
    /// Reports <paramref name="ex"/> to telemetry and the function log together with the
    /// request context that is available (any of the optional arguments may be null).
    /// </summary>
    public static void TrackException(Exception ex, HttpRequestMessage req, TraceWriter log, string reqBodyStr, CrawlResponse reqBody, BlobContent blobContent)
    {
        var props = new Dictionary<string, string>
        {
            { "Service", req.RequestUri.ToString() },
            { "Request", reqBodyStr }
        };

        if (reqBody != null)
        {
            props.Add("AppId", reqBody.Site);
            props.Add("ActionId", reqBody.Id);
        }

        if (blobContent != null)
            props.Add("Response", blobContent.Value);

        TelemetryClient.TrackException(ex, props);
        log.Error($"Request for AppId={reqBody?.Site} ActionId={reqBody?.Id} failed", ex);
    }
}
}

30
Crawl/Web.Debug.config Normal file
Просмотреть файл

@ -0,0 +1,30 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- For more information on using web.config transformation visit https://go.microsoft.com/fwlink/?LinkId=125889 -->
<configuration xmlns:xdt="http://schemas.microsoft.com/XML-Document-Transform">
<!--
In the example below, the "SetAttributes" transform will change the value of
"connectionString" to use "ReleaseSQLServer" only when the "Match" locator
finds an attribute "name" that has a value of "MyDB".
<connectionStrings>
<add name="MyDB"
connectionString="Data Source=ReleaseSQLServer;Initial Catalog=MyReleaseDB;Integrated Security=True"
xdt:Transform="SetAttributes" xdt:Locator="Match(name)"/>
</connectionStrings>
-->
<system.web>
<!--
In the example below, the "Replace" transform will replace the entire
<customErrors> section of your web.config file.
Note that because there is only one customErrors section under the
<system.web> node, there is no need to use the "xdt:Locator" attribute.
<customErrors defaultRedirect="GenericError.htm"
mode="RemoteOnly" xdt:Transform="Replace">
<error statusCode="500" redirect="InternalError.htm"/>
</customErrors>
-->
</system.web>
</configuration>

31
Crawl/Web.Release.config Normal file
Просмотреть файл

@ -0,0 +1,31 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- For more information on using web.config transformation visit https://go.microsoft.com/fwlink/?LinkId=125889 -->
<configuration xmlns:xdt="http://schemas.microsoft.com/XML-Document-Transform">
<!--
In the example below, the "SetAttributes" transform will change the value of
"connectionString" to use "ReleaseSQLServer" only when the "Match" locator
finds an attribute "name" that has a value of "MyDB".
<connectionStrings>
<add name="MyDB"
connectionString="Data Source=ReleaseSQLServer;Initial Catalog=MyReleaseDB;Integrated Security=True"
xdt:Transform="SetAttributes" xdt:Locator="Match(name)"/>
</connectionStrings>
-->
<system.web>
<compilation xdt:Transform="RemoveAttributes(debug)" />
<!--
In the example below, the "Replace" transform will replace the entire
<customErrors> section of your web.config file.
Note that because there is only one customErrors section under the
<system.web> node, there is no need to use the "xdt:Locator" attribute.
<customErrors defaultRedirect="GenericError.htm"
mode="RemoteOnly" xdt:Transform="Replace">
<error statusCode="500" redirect="InternalError.htm"/>
</customErrors>
-->
</system.web>
</configuration>

50
Crawl/Web.config Normal file
Просмотреть файл

@ -0,0 +1,50 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
For more information on how to configure your ASP.NET application, please visit
https://go.microsoft.com/fwlink/?LinkId=169433
-->
<configuration>
<!--
For a description of web.config changes see http://go.microsoft.com/fwlink/?LinkId=235367.
The following attributes can be set on the <httpRuntime> tag.
<system.Web>
<httpRuntime targetFramework="4.6" />
</system.Web>
-->
<system.web>
<!-- Compiles with debug symbols enabled against .NET Framework 4.6.2. -->
<!-- NOTE(review): httpRuntime below still targets 4.5.2 while compilation targets 4.6.2; confirm the mismatch is intentional. -->
<compilation debug="true" targetFramework="4.6.2" />
<httpRuntime targetFramework="4.5.2" />
</system.web>
<runtime>
<assemblyBinding xmlns="urn:schemas-microsoft-com:asm.v1">
<dependentAssembly>
<assemblyIdentity name="Newtonsoft.Json" publicKeyToken="30ad4fe6b2a6aeed" culture="neutral" />
<bindingRedirect oldVersion="0.0.0.0-9.0.0.0" newVersion="9.0.0.0" />
</dependentAssembly>
<dependentAssembly>
<assemblyIdentity name="Microsoft.WindowsAzure.Storage" publicKeyToken="31bf3856ad364e35" culture="neutral" />
<bindingRedirect oldVersion="0.0.0.0-8.1.1.0" newVersion="8.1.1.0" />
</dependentAssembly>
<dependentAssembly>
<assemblyIdentity name="Microsoft.Azure.KeyVault.Core" publicKeyToken="31bf3856ad364e35" culture="neutral" />
<bindingRedirect oldVersion="0.0.0.0-2.0.0.0" newVersion="2.0.0.0" />
</dependentAssembly>
<dependentAssembly>
<assemblyIdentity name="System.Reactive.Core" publicKeyToken="94bc3704cddfc263" culture="neutral" />
<bindingRedirect oldVersion="0.0.0.0-3.0.3000.0" newVersion="3.0.3000.0" />
</dependentAssembly>
<dependentAssembly>
<assemblyIdentity name="System.Net.Http" publicKeyToken="b03f5f7f11d50a3a" culture="neutral" />
<bindingRedirect oldVersion="0.0.0.0-4.1.1.0" newVersion="4.1.1.0" />
</dependentAssembly>
</assemblyBinding>
</runtime>
<system.codedom>
<compilers>
<compiler language="c#;cs;csharp" extension=".cs" type="Microsoft.CodeDom.Providers.DotNetCompilerPlatform.CSharpCodeProvider, Microsoft.CodeDom.Providers.DotNetCompilerPlatform, Version=1.0.4.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35" warningLevel="4" compilerOptions="/langversion:7 /nowarn:1659;1699;1701" />
<compiler language="vb;vbs;visualbasic;vbscript" extension=".vb" type="Microsoft.CodeDom.Providers.DotNetCompilerPlatform.VBCodeProvider, Microsoft.CodeDom.Providers.DotNetCompilerPlatform, Version=1.0.4.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35" warningLevel="4" compilerOptions="/langversion:15 /nowarn:41008 /define:_MYTYPE=\&quot;Web\&quot; /optionInfer+" />
</compilers>
</system.codedom>
</configuration>

3
Crawl/host.json Normal file
Просмотреть файл

@ -0,0 +1,3 @@
{
"id": "ce294cb2fbbc45d7a3473d6160d08a7c"
}

29
Crawl/packages.config Normal file
Просмотреть файл

@ -0,0 +1,29 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="HtmlAgilityPack" version="1.4.9.5" targetFramework="net452" />
<package id="Microsoft.ApplicationInsights" version="2.3.0" targetFramework="net462" />
<package id="Microsoft.Azure.KeyVault" version="2.0.6" targetFramework="net462" />
<package id="Microsoft.Azure.KeyVault.Core" version="2.0.4" targetFramework="net452" />
<package id="Microsoft.Azure.KeyVault.WebKey" version="2.0.5" targetFramework="net462" />
<package id="Microsoft.Azure.WebJobs" version="2.0.0" targetFramework="net452" />
<package id="Microsoft.Azure.WebJobs.Core" version="2.0.0" targetFramework="net452" />
<package id="Microsoft.CodeDom.Providers.DotNetCompilerPlatform" version="1.0.4" targetFramework="net462" />
<package id="Microsoft.Data.Edm" version="5.8.2" targetFramework="net452" />
<package id="Microsoft.Data.OData" version="5.8.2" targetFramework="net452" />
<package id="Microsoft.Data.Services.Client" version="5.8.2" targetFramework="net452" />
<package id="Microsoft.IdentityModel.Clients.ActiveDirectory" version="3.13.9" targetFramework="net462" />
<package id="Microsoft.Net.Compilers" version="2.1.0" targetFramework="net462" developmentDependency="true" />
<package id="Microsoft.Rest.ClientRuntime" version="2.3.7" targetFramework="net462" />
<package id="Microsoft.Rest.ClientRuntime.Azure" version="3.3.6" targetFramework="net462" />
<package id="Newtonsoft.Json" version="9.0.1" targetFramework="net462" />
<package id="System.ComponentModel.EventBasedAsync" version="4.3.0" targetFramework="net452" />
<package id="System.Dynamic.Runtime" version="4.3.0" targetFramework="net452" />
<package id="System.Linq.Queryable" version="4.3.0" targetFramework="net452" />
<package id="System.Net.Requests" version="4.3.0" targetFramework="net452" />
<package id="System.Security.Cryptography.Algorithms" version="4.3.0" targetFramework="net462" />
<package id="System.Security.Cryptography.Encoding" version="4.3.0" targetFramework="net462" />
<package id="System.Security.Cryptography.Primitives" version="4.3.0" targetFramework="net462" />
<package id="System.Security.Cryptography.X509Certificates" version="4.3.0" targetFramework="net462" />
<package id="System.Spatial" version="5.8.2" targetFramework="net452" />
<package id="WindowsAzure.Storage" version="8.1.1" targetFramework="net452" />
</packages>

18
Crawl/readme.MD Normal file
Просмотреть файл

@ -0,0 +1,18 @@
Install the Azure Functions CLI tools:
npm install -g azure-functions-cli
Run from the command line in the Crawl directory:
%AppData%\npm\func run .
You can attach VS to func.exe.
appsettings.json has the Configuration Manager settings.
(Invoke-WebRequest -Method Post -Body $r.Content -ContentType 'application/json' 'http://localhost:7071/api/CognitiveServiceVision').Content
curl -v -X POST http://localhost:7071/api/CognitiveServiceVision -H "Content-Type: application/json" -d @vision.json -H "Accept: application/json"