Add RegexTransformation, ParseDateTransformation

This commit is contained in:
Ovi Dan 2018-11-24 11:55:48 -08:00
Родитель 303834f6c8
Коммит 75b9c34461
13 изменённых файлов: 449 добавлений и 17 удалений

Просмотреть файл

@ -23,6 +23,18 @@
<None Update="TestData\answers.microsoft.com.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="TestData\article_with_date.html">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="TestData\regex_expected_result.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="TestData\parse_date_rules.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="TestData\regex_rules.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="TestData\quora.com.html">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>

Просмотреть файл

@ -6,13 +6,14 @@
namespace Microsoft.Search.StructuredDataExtraction.Tests
{
using System.Globalization;
using System.IO;
using OpenScraping.Config;
using OpenScraping;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using OpenScraping;
using OpenScraping.Config;
using System;
using System.Globalization;
using System.IO;
[TestClass]
public class StructuredDataExtractionTests
@ -437,5 +438,37 @@ namespace Microsoft.Search.StructuredDataExtraction.Tests
Assert.AreEqual("Para1 content Para2 content", parsedJson["body"].Value, "The extracted body is incorrect");
}
/// <summary>
/// Runs the rules in TestData/regex_rules.json against TestData/article_with_date.html
/// and checks that the serialized extraction result is deep-equal to the JSON stored in
/// TestData/regex_expected_result.json.
/// </summary>
[TestMethod]
public void RegexTest()
{
    var configPath = Path.Combine("TestData", "regex_rules.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);

    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_date.html")));
    var actualJson = JsonConvert.SerializeObject(result, Formatting.Indented);
    var parsedActualJson = JObject.Parse(actualJson);

    var expectedJsonPath = Path.Combine("TestData", "regex_expected_result.json");
    var expectedJson = File.ReadAllText(expectedJsonPath);
    var parsedExpectedJson = JObject.Parse(expectedJson);

    // A bare IsTrue only reports "Assert.IsTrue failed"; include the actual JSON so a
    // regression can be diagnosed from the test log without re-running under a debugger.
    Assert.IsTrue(
        JToken.DeepEquals(parsedActualJson, parsedExpectedJson),
        "The extracted JSON does not match " + expectedJsonPath + ". Actual result: " + actualJson);
}
/// <summary>
/// Applies TestData/parse_date_rules.json to TestData/article_with_date.html and verifies
/// the three extracted values: a date parsed without a format, a date parsed with an
/// explicit format, and a date parsed with a format provider and date style.
/// </summary>
[TestMethod]
public void ParseDateTest()
{
    var rulesFile = Path.Combine("TestData", "parse_date_rules.json");
    var rules = StructuredDataConfig.ParseJsonFile(rulesFile);
    var dateExtractor = new StructuredDataExtractor(rules);

    var html = File.ReadAllText(Path.Combine("TestData", "article_with_date.html"));
    var extracted = dateExtractor.Extract(html);

    // Round-trip through JSON text so the values are read back the same way a consumer would.
    var serialized = JsonConvert.SerializeObject(extracted, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(serialized);

    var expectedNoFormat = DateTime.Parse("2018-11-24T00:00:00");
    var expectedWithFormat = DateTime.Parse("2011-12-30T00:00:00");
    var expectedWithProviderStyle = DateTime.Parse("2008-06-12T00:00:00");

    Assert.AreEqual(expectedNoFormat, parsedJson["parsedDateNoFormat"].Value);
    Assert.AreEqual(expectedWithFormat, parsedJson["parsedDateWithFormat"].Value);
    Assert.AreEqual(expectedWithProviderStyle, parsedJson["parsedDateNoFormatWithProviderStyle"].Value);
}
}
}

Просмотреть файл

@ -0,0 +1,14 @@
<!DOCTYPE html>
<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta charset="utf-8" />
<title>Page title</title>
</head>
<body>
<p id="published-timestamp1" class="timestamp">2018-11-24</p>
<p id="published-timestamp2" class="timestamp">Published: <span>12-30-11</span></p>
<p id="published-timestamp3" class="timestamp">Published: 12 Juni 2008</p>
<div class="info">Contact information. Phone: 111-111-111, Address: str.Street 1/1, City. 2017</div>
</body>
</html>

Просмотреть файл

@ -0,0 +1,33 @@
{
"parsedDateNoFormat": {
"_xpath": "//p[@id='published-timestamp1']",
"_transformation": "ParseDateTransformation"
},
"parsedDateWithFormat": {
"_xpath": "//p[@id='published-timestamp2']",
"_transformations": [
{
"_type": "RegexTransformation",
"_regex": "Published: (.*)"
},
{
"_type": "ParseDateTransformation",
"_format": "MM-dd-yy"
}
]
},
"parsedDateNoFormatWithProviderStyle": {
"_xpath": "//p[@id='published-timestamp3']",
"_transformations": [
{
"_type": "RegexTransformation",
"_regex": "Published: (.*)"
},
{
"_type": "ParseDateTransformation",
"_formatProvider": "de-DE",
"_dateStyle": "None"
}
]
}
}

Просмотреть файл

@ -0,0 +1,54 @@
{
"publishedDTNoGroupNameForceArrayFalse": "12-30-11",
"publishedDTWithGroupNameForceArrayFalse": "12-30-11",
"multiMatchNoGroupNameForceArrayFalse": {
"1": "111-111-111",
"2": "str.Street 1/1, City. 2017"
},
"multiMatchWithGroupNameForceArrayFalse": {
"phone": "111-111-111",
"address": "str.Street 1/1, City. 2017"
},
"publishedDTNoGroupNameForceArrayTrue": [
{
"1": [
"12 Juni 2008"
]
}
],
"publishedDTWithGroupNameForceArrayTrue": [
{
"date": [
"12 Juni 2008"
]
}
],
"multiMatchNoGroupNameForceArrayTrue": [
{
"1": [
"111-111-111"
],
"2": [
"str.Street 1/1, City. 2017"
]
}
],
"multiMatchWithGroupNameForceArrayTrue": [
{
"phone": [
"111-111-111"
],
"address": [
"str.Street 1/1, City. 2017"
]
}
],
"multiMatchWithSameGroupNameForceArrayTrue": [
{
"same_group": [
"111-111-111",
"str.Street 1/1, City. 2017"
]
}
]
}

Просмотреть файл

@ -0,0 +1,90 @@
{
"publishedDTNoGroupNameForceArrayFalse": {
"_xpath": "//p[@id='published-timestamp2']",
"_transformations": [
{
"_type": "RegexTransformation",
"_regex": "Published: (.*)"
}
]
},
"publishedDTWithGroupNameForceArrayFalse": {
"_xpath": "//p[@id='published-timestamp2']",
"_transformations": [
{
"_type": "RegexTransformation",
"_regex": "Published: (?<date>.*)",
"_regexOption": "IgnoreCase"
}
]
},
"multiMatchNoGroupNameForceArrayFalse": {
"_xpath": "//div[@class='info']",
"_transformations": [
{
"_type": "RegexTransformation",
"_regex": "^Contact information\\. Phone: ([0-9-]+), Address: (.*)$",
"_regexOptions": [ "IgnoreCase", "CultureInvariant" ]
}
]
},
"multiMatchWithGroupNameForceArrayFalse": {
"_xpath": "//div[@class='info']",
"_transformations": [
{
"_type": "RegexTransformation",
"_regex": "^Contact information\\. Phone: (?<phone>[0-9-]+), Address: (?<address>.*)$"
}
]
},
"publishedDTNoGroupNameForceArrayTrue": {
"_xpath": "//p[@id='published-timestamp3']",
"_transformations": [
{
"_type": "RegexTransformation",
"_regex": "Published: (.*)",
"_forceArray": true
}
]
},
"publishedDTWithGroupNameForceArrayTrue": {
"_xpath": "//p[@id='published-timestamp3']",
"_transformations": [
{
"_type": "RegexTransformation",
"_regex": "Published: (?<date>.*)",
"_forceArray": true
}
]
},
"multiMatchNoGroupNameForceArrayTrue": {
"_xpath": "//div[@class='info']",
"_transformations": [
{
"_type": "RegexTransformation",
"_regex": "^Contact information\\. Phone: ([0-9-]+), Address: (.*)$",
"_forceArray": true
}
]
},
"multiMatchWithGroupNameForceArrayTrue": {
"_xpath": "//div[@class='info']",
"_transformations": [
{
"_type": "RegexTransformation",
"_regex": "^Contact information\\. Phone: (?<phone>[0-9-]+), Address: (?<address>.*)$",
"_forceArray": true
}
]
},
"multiMatchWithSameGroupNameForceArrayTrue": {
"_xpath": "//div[@class='info']",
"_transformations": [
{
"_type": "RegexTransformation",
"_regex": "^Contact information\\. Phone: (?<same_group>[0-9-]+), Address: (?<same_group>.*)$",
"_forceArray": true
}
]
}
}

Просмотреть файл

@ -5,7 +5,7 @@ VisualStudioVersion = 15.0.26730.8
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenScraping", "OpenScraping\OpenScraping.csproj", "{C38AA240-58C8-4081-BFA1-2818DE64A583}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenScraping.Tests", "OpenScraping.Tests\OpenScraping.Tests.csproj", "{AB0BE337-12A9-4807-96F5-73854A81C2D8}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "OpenScraping.Tests", "OpenScraping.Tests\OpenScraping.Tests.csproj", "{AB0BE337-12A9-4807-96F5-73854A81C2D8}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution

Просмотреть файл

@ -8,13 +8,11 @@
<PackageProjectUrl>https://github.com/Microsoft/openscraping-lib-csharp</PackageProjectUrl>
<RepositoryType>https://github.com/Microsoft/openscraping-lib-csharp</RepositoryType>
<PackageTags>html extraction scraping scraper parser parsing open scraping openscraping</PackageTags>
<PackageReleaseNotes>- Added support for _removeXPaths config key, which allows removing some child nodes based on xPath rules BEFORE we process the main parent node. Useful for example when extracting a news article that contains divs that we want to remove BEFORE we extract the body of an article.
- Updated some dependencies like HtmlAgilityPack to the latest stable version.
- Simplified reading configs and added support for specifying some config keys as either singular or plural, and setting their values to either a single value or an array.</PackageReleaseNotes>
<AssemblyVersion>1.2.0.0</AssemblyVersion>
<FileVersion>1.2.0.0</FileVersion>
<PackageReleaseNotes>Added support for Regex transformation and ParseDateTransformation</PackageReleaseNotes>
<AssemblyVersion>1.3.0.0</AssemblyVersion>
<FileVersion>1.3.0.0</FileVersion>
<PackageIconUrl>https://github.com/Microsoft/openscraping-lib-csharp/blob/master/logo.png?raw=true</PackageIconUrl>
<Version>1.2.0</Version>
<Version>1.3.0</Version>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="HtmlAgilityPack" Version="1.8.10" />

Просмотреть файл

@ -41,12 +41,12 @@ namespace OpenScraping.Transformations
var maxLevel = 3;
var maxTitleLength = 200;
if (settings != null && settings["_maxStepsUpward"] != null && ((JValue)settings["_maxStepsUpward"]).Type == JTokenType.Integer)
if (settings != null && settings.ContainsKey("_maxStepsUpward") && ((JValue)settings["_maxStepsUpward"]).Type == JTokenType.Integer)
{
maxLevel = ((JValue)settings["_maxStepsUpward"]).ToObject<int>();
}
if (settings != null && settings["_maxTitleLength"] != null && ((JValue)settings["_maxTitleLength"]).Type == JTokenType.Integer)
if (settings != null && settings.ContainsKey("_maxTitleLength") && ((JValue)settings["_maxTitleLength"]).Type == JTokenType.Integer)
{
maxTitleLength = ((JValue)settings["_maxTitleLength"]).ToObject<int>();
}

Просмотреть файл

@ -0,0 +1,81 @@
// -----------------------------------------------------------------------
// <copyright file="ParseDateTransformation.cs" company="Microsoft">
// Copyright (c) Microsoft. All rights reserved.
// </copyright>
// -----------------------------------------------------------------------
namespace OpenScraping.Transformations
{
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.Globalization;
/// <summary>
/// Transformation that parses text into a <see cref="DateTime"/>.
/// </summary>
/// <remarks>
/// Optional settings (each is honoured only when its value is a JSON string):
/// <list type="bullet">
/// <item><description><c>_format</c>: exact format passed to <see cref="DateTime.TryParseExact(string, string, IFormatProvider, DateTimeStyles, out DateTime)"/>; when absent, <see cref="DateTime.TryParse(string, IFormatProvider, DateTimeStyles, out DateTime)"/> is used.</description></item>
/// <item><description><c>_formatProvider</c>: culture name handed to the <see cref="CultureInfo"/> constructor; defaults to the invariant culture.</description></item>
/// <item><description><c>_dateStyle</c>: name of a <see cref="DateTimeStyles"/> member; defaults to <see cref="DateTimeStyles.None"/>.</description></item>
/// </list>
/// Exceptions raised by the <see cref="CultureInfo"/> constructor or by <see cref="Enum.Parse(Type, string)"/>
/// for an unrecognised value are not caught here and propagate to the caller.
/// </remarks>
public class ParseDateTransformation : ITransformationFromObject, ITransformationFromHtml
{
    /// <summary>
    /// Parses the output of a previous transformation.
    /// </summary>
    /// <param name="settings">Transformation settings; may be null.</param>
    /// <param name="input">Value to parse; anything other than a string yields null.</param>
    /// <returns>The parsed <see cref="DateTime"/> (boxed), or null when the input is not a string or cannot be parsed.</returns>
    public object Transform(Dictionary<string, object> settings, object input)
    {
        if (input is string rawDate)
        {
            return ParseDate(settings, rawDate);
        }

        return null;
    }

    /// <summary>
    /// Parses the inner text of an HTML node.
    /// </summary>
    /// <param name="settings">Transformation settings; may be null.</param>
    /// <param name="node">Node whose <c>InnerText</c> is parsed; a null node yields null.</param>
    /// <param name="logicalParents">Not used by this transformation.</param>
    /// <returns>The parsed <see cref="DateTime"/> (boxed), or null when the node is null or its text cannot be parsed.</returns>
    public object Transform(Dictionary<string, object> settings, HtmlAgilityPack.HtmlNode node, List<HtmlAgilityPack.HtmlNode> logicalParents)
    {
        if (node == null)
        {
            return null;
        }

        return ParseDate(settings, node.InnerText);
    }

    /// <summary>
    /// Reads the optional settings and attempts to parse <paramref name="rawDate"/>.
    /// </summary>
    /// <returns>The parsed date, or null when parsing fails.</returns>
    private static object ParseDate(Dictionary<string, object> settings, string rawDate)
    {
        var formatProvider = CultureInfo.InvariantCulture;
        var dateStyle = DateTimeStyles.None;

        // format stays null when the setting is missing or not a string, which selects TryParse below.
        TryGetStringSetting(settings, "_format", out string format);

        if (TryGetStringSetting(settings, "_formatProvider", out string rawFormatProvider))
        {
            formatProvider = new CultureInfo(rawFormatProvider);
        }

        if (TryGetStringSetting(settings, "_dateStyle", out string rawDateStyle))
        {
            dateStyle = (DateTimeStyles)Enum.Parse(typeof(DateTimeStyles), rawDateStyle);
        }

        DateTime date;
        var parsed = format != null
            ? DateTime.TryParseExact(rawDate, format, formatProvider, dateStyle, out date)
            : DateTime.TryParse(rawDate, formatProvider, dateStyle, out date);

        if (parsed)
        {
            return date;
        }

        return null;
    }

    /// <summary>
    /// Looks up <paramref name="key"/> with a single dictionary access and returns its text
    /// only when the stored value is a JSON string. Values of any other shape (null, array,
    /// object, number, ...) are treated as "not configured" instead of failing on a cast.
    /// </summary>
    private static bool TryGetStringSetting(Dictionary<string, object> settings, string key, out string value)
    {
        value = null;

        if (settings != null
            && settings.TryGetValue(key, out object raw)
            && raw is JValue jsonValue
            && jsonValue.Type == JTokenType.String)
        {
            value = jsonValue.ToString();
            return true;
        }

        return false;
    }
}
}

Просмотреть файл

@ -0,0 +1,117 @@
// -----------------------------------------------------------------------
// <copyright file="RegexTransformation.cs" company="Microsoft">
// Copyright (c) Microsoft. All rights reserved.
// </copyright>
// -----------------------------------------------------------------------
namespace OpenScraping.Transformations
{
using Newtonsoft.Json.Linq;
using System;
using System.Linq;
using System.Collections.Generic;
using System.Text.RegularExpressions;
/// <summary>
/// Transformation that runs a regular expression over a node's inner text and returns
/// the captured groups.
/// </summary>
/// <remarks>
/// Settings:
/// <list type="bullet">
/// <item><description><c>_regex</c> (required): the pattern.</description></item>
/// <item><description><c>_regexOption</c>: a single <see cref="RegexOptions"/> member name (JSON string).</description></item>
/// <item><description><c>_regexOptions</c>: a JSON array of <see cref="RegexOptions"/> member names; combined with <c>_regexOption</c> when both are present.</description></item>
/// <item><description><c>_forceArray</c>: when true, the result is always an array of matches and every group value is an array of captures.</description></item>
/// </list>
/// Result shape when <c>_forceArray</c> is false: a plain string for one match with exactly one
/// capturing group; a single <see cref="JObject"/> (group name to value) for one match with several
/// groups; otherwise a <see cref="JArray"/> of such objects.
/// </remarks>
public class RegexTransformation : ITransformationFromHtml
{
    /// <summary>
    /// Applies the configured regular expression to <paramref name="node"/>'s inner text.
    /// </summary>
    /// <param name="settings">Transformation settings; must contain <c>_regex</c>.</param>
    /// <param name="node">Node whose <c>InnerText</c> is matched.</param>
    /// <param name="logicalParents">Not used by this transformation.</param>
    /// <returns>
    /// A string, <see cref="JObject"/> or <see cref="JArray"/> as described on the class, or null
    /// when the node is null or its text is empty/whitespace.
    /// </returns>
    /// <exception cref="ArgumentException">No <c>_regex</c> setting is available.</exception>
    public object Transform(Dictionary<string, object> settings, HtmlAgilityPack.HtmlNode node, List<HtmlAgilityPack.HtmlNode> logicalParents)
    {
        if (node == null)
        {
            return null;
        }

        var text = node.InnerText;

        if (string.IsNullOrWhiteSpace(text))
        {
            return null;
        }

        // A missing settings dictionary or a null pattern is reported the same way as a
        // missing key, rather than surfacing as a NullReferenceException.
        if (settings == null
            || !settings.TryGetValue("_regex", out object regexPatternObj)
            || regexPatternObj == null)
        {
            throw new ArgumentException("Could not find a _regex setting");
        }

        var forceArray = false;

        if (settings.TryGetValue("_forceArray", out object forceArrayObj) && forceArrayObj != null)
        {
            forceArray = bool.Parse(forceArrayObj.ToString());
        }

        var regex = new Regex(regexPatternObj.ToString(), ReadRegexOptions(settings));
        var matches = regex.Matches(text);

        // Enumerate the real capturing-group numbers instead of assuming 1..Count-1:
        // explicitly numbered groups such as (?<5>...) are sparse, and indexing Groups with
        // a number that is not in the pattern yields an empty group with an empty name.
        // Group 0 (the whole match) is never reported.
        var groupNumbers = regex.GetGroupNumbers().Where(number => number != 0).ToArray();

        if (!forceArray
            && matches.Count == 1
            && groupNumbers.Length == 1)
        {
            return matches[0].Groups[groupNumbers[0]].Value;
        }

        var returnedMatches = new List<JObject>();

        foreach (Match match in matches)
        {
            var returnedMatch = new JObject();

            foreach (var groupNumber in groupNumbers)
            {
                var group = match.Groups[groupNumber];
                var groupName = regex.GroupNameFromNumber(groupNumber);

                if (!forceArray && group.Captures.Count == 1)
                {
                    returnedMatch[groupName] = group.Value;
                }
                else
                {
                    // Zero captures (unmatched optional group) or several captures
                    // (repeated / same-named groups) are reported as an array.
                    var captures = new List<string>();

                    foreach (Capture capture in group.Captures)
                    {
                        captures.Add(capture.Value);
                    }

                    returnedMatch[groupName] = new JArray(captures);
                }
            }

            returnedMatches.Add(returnedMatch);
        }

        if (!forceArray
            && returnedMatches.Count == 1)
        {
            return returnedMatches[0];
        }

        return new JArray(returnedMatches);
    }

    /// <summary>
    /// Combines the <c>_regexOption</c> (string) and <c>_regexOptions</c> (array) settings into
    /// one <see cref="RegexOptions"/> value. Settings of any other JSON shape are ignored;
    /// unknown option names make <see cref="Enum.Parse(Type, string)"/> throw.
    /// </summary>
    private static RegexOptions ReadRegexOptions(Dictionary<string, object> settings)
    {
        var regexOptions = RegexOptions.None;

        if (settings.TryGetValue("_regexOption", out object singleOption)
            && singleOption is JToken singleToken
            && singleToken.Type == JTokenType.String)
        {
            regexOptions |= (RegexOptions)Enum.Parse(typeof(RegexOptions), singleToken.ToString());
        }

        if (settings.TryGetValue("_regexOptions", out object manyOptions)
            && manyOptions is JArray rawOptions)
        {
            foreach (var rawOption in rawOptions)
            {
                regexOptions |= (RegexOptions)Enum.Parse(typeof(RegexOptions), (string)rawOption);
            }
        }

        return regexOptions;
    }
}
}

Просмотреть файл

@ -24,12 +24,12 @@ namespace OpenScraping.Transformations
if (!string.IsNullOrWhiteSpace(text))
{
if (settings != null && settings["_separator"] != null && ((JValue)settings["_separator"]).Type == JTokenType.String)
if (settings != null && settings.ContainsKey("_separator") && ((JValue)settings["_separator"]).Type == JTokenType.String)
{
separator = settings["_separator"].ToString();
}
if (settings != null && settings["_trim"] != null && ((JValue)settings["_trim"]).Type == JTokenType.Boolean)
if (settings != null && settings.ContainsKey("_trim") && ((JValue)settings["_trim"]).Type == JTokenType.Boolean)
{
trim = (bool)((JValue)settings["_trim"]).Value;
}

Просмотреть файл

@ -26,7 +26,7 @@ namespace OpenScraping.Transformations
var grandParentNode = logicalParents[logicalParents.Count - 2];
HtmlAgilityPack.HtmlNode parentNode = grandParentNode;
if (settings != null && settings["_startingXPath"] != null && ((JValue)settings["_startingXPath"]).Type == JTokenType.String)
if (settings != null && settings.ContainsKey("_startingXPath") && ((JValue)settings["_startingXPath"]).Type == JTokenType.String)
{
var startingXPath = ((JValue)settings["_startingXPath"]).ToObject<string>();