Add RegexTransformation, ParseDateTransformation
This commit is contained in:
Родитель
303834f6c8
Коммит
75b9c34461
|
@ -23,6 +23,18 @@
|
||||||
<None Update="TestData\answers.microsoft.com.json">
|
<None Update="TestData\answers.microsoft.com.json">
|
||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
</None>
|
</None>
|
||||||
|
<None Update="TestData\article_with_date.html">
|
||||||
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
|
</None>
|
||||||
|
<None Update="TestData\regex_expected_result.json">
|
||||||
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
|
</None>
|
||||||
|
<None Update="TestData\parse_date_rules.json">
|
||||||
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
|
</None>
|
||||||
|
<None Update="TestData\regex_rules.json">
|
||||||
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
|
</None>
|
||||||
<None Update="TestData\quora.com.html">
|
<None Update="TestData\quora.com.html">
|
||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
</None>
|
</None>
|
||||||
|
|
|
@ -6,13 +6,14 @@
|
||||||
|
|
||||||
namespace Microsoft.Search.StructuredDataExtraction.Tests
|
namespace Microsoft.Search.StructuredDataExtraction.Tests
|
||||||
{
|
{
|
||||||
using System.Globalization;
|
using Microsoft.VisualStudio.TestTools.UnitTesting;
|
||||||
using System.IO;
|
|
||||||
using OpenScraping.Config;
|
|
||||||
using OpenScraping;
|
|
||||||
using Newtonsoft.Json;
|
using Newtonsoft.Json;
|
||||||
using Newtonsoft.Json.Linq;
|
using Newtonsoft.Json.Linq;
|
||||||
using Microsoft.VisualStudio.TestTools.UnitTesting;
|
using OpenScraping;
|
||||||
|
using OpenScraping.Config;
|
||||||
|
using System;
|
||||||
|
using System.Globalization;
|
||||||
|
using System.IO;
|
||||||
|
|
||||||
[TestClass]
|
[TestClass]
|
||||||
public class StructuredDataExtractionTests
|
public class StructuredDataExtractionTests
|
||||||
|
@ -437,5 +438,37 @@ namespace Microsoft.Search.StructuredDataExtraction.Tests
|
||||||
Assert.AreEqual("Para1 content Para2 content", parsedJson["body"].Value, "The extracted body is incorrect");
|
Assert.AreEqual("Para1 content Para2 content", parsedJson["body"].Value, "The extracted body is incorrect");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Runs the rules in TestData/regex_rules.json against TestData/article_with_date.html
/// and checks that the serialized extraction result is deep-equal to
/// TestData/regex_expected_result.json.
/// </summary>
[TestMethod]
public void RegexTest()
{
    // Arrange: build an extractor from the regex rule file.
    var rulesFile = Path.Combine("TestData", "regex_rules.json");
    var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonFile(rulesFile));

    // Act: extract from the fixture page and round-trip the result through JSON text.
    var html = File.ReadAllText(Path.Combine("TestData", "article_with_date.html"));
    var extracted = extractor.Extract(html);
    var actual = JObject.Parse(JsonConvert.SerializeObject(extracted, Formatting.Indented));

    // Assert: structural comparison, so property formatting/whitespace is irrelevant.
    var expectedFile = Path.Combine("TestData", "regex_expected_result.json");
    var expected = JObject.Parse(File.ReadAllText(expectedFile));

    Assert.IsTrue(JToken.DeepEquals(actual, expected));
}
|
||||||
|
|
||||||
|
/// <summary>
/// Runs the rules in TestData/parse_date_rules.json against TestData/article_with_date.html
/// and checks the three parsed dates: default parsing, parsing with an explicit
/// "_format", and parsing with a "_formatProvider" / "_dateStyle" pair.
/// </summary>
[TestMethod]
public void ParseDateTest()
{
    var configPath = Path.Combine("TestData", "parse_date_rules.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);

    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_date.html")));
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    // Expected values are constructed directly instead of via DateTime.Parse(string),
    // which consults the current thread culture; the expectation must not depend on
    // the machine the test happens to run on.
    Assert.AreEqual(new DateTime(2018, 11, 24), parsedJson["parsedDateNoFormat"].Value, "The date parsed without a format is incorrect");
    Assert.AreEqual(new DateTime(2011, 12, 30), parsedJson["parsedDateWithFormat"].Value, "The date parsed with an explicit format is incorrect");
    Assert.AreEqual(new DateTime(2008, 6, 12), parsedJson["parsedDateNoFormatWithProviderStyle"].Value, "The date parsed with a format provider and date style is incorrect");
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
|
||||||
|
<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<title>Page title</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p id="published-timestamp1" class="timestamp">2018-11-24</p>
|
||||||
|
<p id="published-timestamp2" class="timestamp">Published: <span>12-30-11</span></p>
|
||||||
|
<p id="published-timestamp3" class="timestamp">Published: 12 Juni 2008</p>
|
||||||
|
<div class="info">Contact information. Phone: 111-111-111, Address: str.Street 1/1, City. 2017</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,33 @@
|
||||||
|
{
|
||||||
|
"parsedDateNoFormat": {
|
||||||
|
"_xpath": "//p[@id='published-timestamp1']",
|
||||||
|
"_transformation": "ParseDateTransformation"
|
||||||
|
},
|
||||||
|
"parsedDateWithFormat": {
|
||||||
|
"_xpath": "//p[@id='published-timestamp2']",
|
||||||
|
"_transformations": [
|
||||||
|
{
|
||||||
|
"_type": "RegexTransformation",
|
||||||
|
"_regex": "Published: (.*)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"_type": "ParseDateTransformation",
|
||||||
|
"_format": "MM-dd-yy"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"parsedDateNoFormatWithProviderStyle": {
|
||||||
|
"_xpath": "//p[@id='published-timestamp3']",
|
||||||
|
"_transformations": [
|
||||||
|
{
|
||||||
|
"_type": "RegexTransformation",
|
||||||
|
"_regex": "Published: (.*)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"_type": "ParseDateTransformation",
|
||||||
|
"_formatProvider": "de-DE",
|
||||||
|
"_dateStyle": "None"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
{
|
||||||
|
"publishedDTNoGroupNameForceArrayFalse": "12-30-11",
|
||||||
|
"publishedDTWithGroupNameForceArrayFalse": "12-30-11",
|
||||||
|
"multiMatchNoGroupNameForceArrayFalse": {
|
||||||
|
"1": "111-111-111",
|
||||||
|
"2": "str.Street 1/1, City. 2017"
|
||||||
|
},
|
||||||
|
"multiMatchWithGroupNameForceArrayFalse": {
|
||||||
|
"phone": "111-111-111",
|
||||||
|
"address": "str.Street 1/1, City. 2017"
|
||||||
|
},
|
||||||
|
"publishedDTNoGroupNameForceArrayTrue": [
|
||||||
|
{
|
||||||
|
"1": [
|
||||||
|
"12 Juni 2008"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"publishedDTWithGroupNameForceArrayTrue": [
|
||||||
|
{
|
||||||
|
"date": [
|
||||||
|
"12 Juni 2008"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"multiMatchNoGroupNameForceArrayTrue": [
|
||||||
|
{
|
||||||
|
"1": [
|
||||||
|
"111-111-111"
|
||||||
|
],
|
||||||
|
"2": [
|
||||||
|
"str.Street 1/1, City. 2017"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"multiMatchWithGroupNameForceArrayTrue": [
|
||||||
|
{
|
||||||
|
"phone": [
|
||||||
|
"111-111-111"
|
||||||
|
],
|
||||||
|
"address": [
|
||||||
|
"str.Street 1/1, City. 2017"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"multiMatchWithSameGroupNameForceArrayTrue": [
|
||||||
|
{
|
||||||
|
"same_group": [
|
||||||
|
"111-111-111",
|
||||||
|
"str.Street 1/1, City. 2017"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
|
@ -0,0 +1,90 @@
|
||||||
|
{
|
||||||
|
"publishedDTNoGroupNameForceArrayFalse": {
|
||||||
|
"_xpath": "//p[@id='published-timestamp2']",
|
||||||
|
"_transformations": [
|
||||||
|
{
|
||||||
|
"_type": "RegexTransformation",
|
||||||
|
"_regex": "Published: (.*)"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"publishedDTWithGroupNameForceArrayFalse": {
|
||||||
|
"_xpath": "//p[@id='published-timestamp2']",
|
||||||
|
"_transformations": [
|
||||||
|
{
|
||||||
|
"_type": "RegexTransformation",
|
||||||
|
"_regex": "Published: (?<date>.*)",
|
||||||
|
"_regexOption": "IgnoreCase"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"multiMatchNoGroupNameForceArrayFalse": {
|
||||||
|
"_xpath": "//div[@class='info']",
|
||||||
|
"_transformations": [
|
||||||
|
{
|
||||||
|
"_type": "RegexTransformation",
|
||||||
|
"_regex": "^Contact information\\. Phone: ([0-9-]+), Address: (.*)$",
|
||||||
|
"_regexOptions": [ "IgnoreCase", "CultureInvariant" ]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"multiMatchWithGroupNameForceArrayFalse": {
|
||||||
|
"_xpath": "//div[@class='info']",
|
||||||
|
"_transformations": [
|
||||||
|
{
|
||||||
|
"_type": "RegexTransformation",
|
||||||
|
"_regex": "^Contact information\\. Phone: (?<phone>[0-9-]+), Address: (?<address>.*)$"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"publishedDTNoGroupNameForceArrayTrue": {
|
||||||
|
"_xpath": "//p[@id='published-timestamp3']",
|
||||||
|
"_transformations": [
|
||||||
|
{
|
||||||
|
"_type": "RegexTransformation",
|
||||||
|
"_regex": "Published: (.*)",
|
||||||
|
"_forceArray": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"publishedDTWithGroupNameForceArrayTrue": {
|
||||||
|
"_xpath": "//p[@id='published-timestamp3']",
|
||||||
|
"_transformations": [
|
||||||
|
{
|
||||||
|
"_type": "RegexTransformation",
|
||||||
|
"_regex": "Published: (?<date>.*)",
|
||||||
|
"_forceArray": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"multiMatchNoGroupNameForceArrayTrue": {
|
||||||
|
"_xpath": "//div[@class='info']",
|
||||||
|
"_transformations": [
|
||||||
|
{
|
||||||
|
"_type": "RegexTransformation",
|
||||||
|
"_regex": "^Contact information\\. Phone: ([0-9-]+), Address: (.*)$",
|
||||||
|
"_forceArray": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"multiMatchWithGroupNameForceArrayTrue": {
|
||||||
|
"_xpath": "//div[@class='info']",
|
||||||
|
"_transformations": [
|
||||||
|
{
|
||||||
|
"_type": "RegexTransformation",
|
||||||
|
"_regex": "^Contact information\\. Phone: (?<phone>[0-9-]+), Address: (?<address>.*)$",
|
||||||
|
"_forceArray": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"multiMatchWithSameGroupNameForceArrayTrue": {
|
||||||
|
"_xpath": "//div[@class='info']",
|
||||||
|
"_transformations": [
|
||||||
|
{
|
||||||
|
"_type": "RegexTransformation",
|
||||||
|
"_regex": "^Contact information\\. Phone: (?<same_group>[0-9-]+), Address: (?<same_group>.*)$",
|
||||||
|
"_forceArray": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
|
@ -5,7 +5,7 @@ VisualStudioVersion = 15.0.26730.8
|
||||||
MinimumVisualStudioVersion = 10.0.40219.1
|
MinimumVisualStudioVersion = 10.0.40219.1
|
||||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenScraping", "OpenScraping\OpenScraping.csproj", "{C38AA240-58C8-4081-BFA1-2818DE64A583}"
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenScraping", "OpenScraping\OpenScraping.csproj", "{C38AA240-58C8-4081-BFA1-2818DE64A583}"
|
||||||
EndProject
|
EndProject
|
||||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenScraping.Tests", "OpenScraping.Tests\OpenScraping.Tests.csproj", "{AB0BE337-12A9-4807-96F5-73854A81C2D8}"
|
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "OpenScraping.Tests", "OpenScraping.Tests\OpenScraping.Tests.csproj", "{AB0BE337-12A9-4807-96F5-73854A81C2D8}"
|
||||||
EndProject
|
EndProject
|
||||||
Global
|
Global
|
||||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
|
|
@ -8,13 +8,11 @@
|
||||||
<PackageProjectUrl>https://github.com/Microsoft/openscraping-lib-csharp</PackageProjectUrl>
|
<PackageProjectUrl>https://github.com/Microsoft/openscraping-lib-csharp</PackageProjectUrl>
|
||||||
<RepositoryType>https://github.com/Microsoft/openscraping-lib-csharp</RepositoryType>
|
<RepositoryType>https://github.com/Microsoft/openscraping-lib-csharp</RepositoryType>
|
||||||
<PackageTags>html extraction scraping scraper parser parsing open scraping openscraping</PackageTags>
|
<PackageTags>html extraction scraping scraper parser parsing open scraping openscraping</PackageTags>
|
||||||
<PackageReleaseNotes>- Added support for _removeXPaths config key, which allows removing some child nodes based on xPath rules BEFORE we process the main parent node. Useful for example when extracting a news article that contains divs that we want to remove BEFORE we extract the body of an article.
|
<PackageReleaseNotes>Added support for Regex transformation and ParseDateTransformation</PackageReleaseNotes>
|
||||||
- Updated some dependencies like HtmlAgilityPack to the latest stable version.
|
<AssemblyVersion>1.3.0.0</AssemblyVersion>
|
||||||
- Simplified reading configs and added support for specifying some config keys as either singular or plural, and setting their values to either a single value or an array.</PackageReleaseNotes>
|
<FileVersion>1.3.0.0</FileVersion>
|
||||||
<AssemblyVersion>1.2.0.0</AssemblyVersion>
|
|
||||||
<FileVersion>1.2.0.0</FileVersion>
|
|
||||||
<PackageIconUrl>https://github.com/Microsoft/openscraping-lib-csharp/blob/master/logo.png?raw=true</PackageIconUrl>
|
<PackageIconUrl>https://github.com/Microsoft/openscraping-lib-csharp/blob/master/logo.png?raw=true</PackageIconUrl>
|
||||||
<Version>1.2.0</Version>
|
<Version>1.3.0</Version>
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="HtmlAgilityPack" Version="1.8.10" />
|
<PackageReference Include="HtmlAgilityPack" Version="1.8.10" />
|
||||||
|
|
|
@ -41,12 +41,12 @@ namespace OpenScraping.Transformations
|
||||||
var maxLevel = 3;
|
var maxLevel = 3;
|
||||||
var maxTitleLength = 200;
|
var maxTitleLength = 200;
|
||||||
|
|
||||||
if (settings != null && settings["_maxStepsUpward"] != null && ((JValue)settings["_maxStepsUpward"]).Type == JTokenType.Integer)
|
if (settings != null && settings.ContainsKey("_maxStepsUpward") && ((JValue)settings["_maxStepsUpward"]).Type == JTokenType.Integer)
|
||||||
{
|
{
|
||||||
maxLevel = ((JValue)settings["_maxStepsUpward"]).ToObject<int>();
|
maxLevel = ((JValue)settings["_maxStepsUpward"]).ToObject<int>();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (settings != null && settings["_maxTitleLength"] != null && ((JValue)settings["_maxTitleLength"]).Type == JTokenType.Integer)
|
if (settings != null && settings.ContainsKey("_maxTitleLength") && ((JValue)settings["_maxTitleLength"]).Type == JTokenType.Integer)
|
||||||
{
|
{
|
||||||
maxTitleLength = ((JValue)settings["_maxTitleLength"]).ToObject<int>();
|
maxTitleLength = ((JValue)settings["_maxTitleLength"]).ToObject<int>();
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,81 @@
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
// <copyright file="ParseDateTransformation.cs" company="Microsoft">
|
||||||
|
// Copyright (c) Microsoft. All rights reserved.
|
||||||
|
// </copyright>
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
namespace OpenScraping.Transformations
|
||||||
|
{
|
||||||
|
using Newtonsoft.Json.Linq;
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Globalization;
|
||||||
|
|
||||||
|
/// <summary>
/// Transformation that parses text into a <see cref="DateTime"/>.
/// The text comes either from the output of a previous transformation (a string)
/// or from the inner text of an HTML node.
/// </summary>
/// <remarks>
/// Optional settings, each read only when present as a JSON string value:
/// <list type="bullet">
/// <item><description><c>_format</c>: exact format string; when set, <see cref="DateTime.TryParseExact(string, string, IFormatProvider, DateTimeStyles, out DateTime)"/> is used instead of <see cref="DateTime.TryParse(string, IFormatProvider, DateTimeStyles, out DateTime)"/>.</description></item>
/// <item><description><c>_formatProvider</c>: culture name passed to <see cref="CultureInfo"/>; defaults to <see cref="CultureInfo.InvariantCulture"/>.</description></item>
/// <item><description><c>_dateStyle</c>: name of a <see cref="DateTimeStyles"/> value; defaults to <see cref="DateTimeStyles.None"/>.</description></item>
/// </list>
/// </remarks>
public class ParseDateTransformation : ITransformationFromObject, ITransformationFromHtml
{
    /// <summary>
    /// Parses the output of a previous transformation.
    /// </summary>
    /// <param name="settings">Transformation settings; may be null.</param>
    /// <param name="input">Value to parse; anything other than a string yields null.</param>
    /// <returns>The parsed <see cref="DateTime"/> (boxed), or null when the input is not a string or cannot be parsed.</returns>
    public object Transform(Dictionary<string, object> settings, object input)
    {
        if (input is string rawDate)
        {
            return ParseDate(settings, rawDate);
        }

        return null;
    }

    /// <summary>
    /// Parses the inner text of an HTML node.
    /// </summary>
    /// <param name="settings">Transformation settings; may be null.</param>
    /// <param name="node">Node whose inner text is parsed; null yields null.</param>
    /// <param name="logicalParents">Not used by this transformation.</param>
    /// <returns>The parsed <see cref="DateTime"/> (boxed), or null when the node is null or its text cannot be parsed.</returns>
    public object Transform(Dictionary<string, object> settings, HtmlAgilityPack.HtmlNode node, List<HtmlAgilityPack.HtmlNode> logicalParents)
    {
        if (node != null)
        {
            return ParseDate(settings, node.InnerText);
        }

        return null;
    }

    /// <summary>
    /// Reads a setting that is expected to be a JSON string.
    /// </summary>
    /// <returns>
    /// True and the string when the key exists and holds a <see cref="JValue"/> of type
    /// <see cref="JTokenType.String"/>; false otherwise. Null values and values of any
    /// other type are treated as "not set" rather than raising a cast or null-reference error.
    /// </returns>
    private static bool TryGetStringSetting(Dictionary<string, object> settings, string key, out string value)
    {
        if (settings != null
            && settings.TryGetValue(key, out object rawValue)
            && rawValue is JValue jsonValue
            && jsonValue.Type == JTokenType.String)
        {
            value = jsonValue.ToString();
            return true;
        }

        value = null;
        return false;
    }

    /// <summary>
    /// Parses <paramref name="rawDate"/> according to the optional settings.
    /// </summary>
    /// <returns>The parsed <see cref="DateTime"/> (boxed), or null when parsing fails.</returns>
    private static object ParseDate(Dictionary<string, object> settings, string rawDate)
    {
        TryGetStringSetting(settings, "_format", out string format);

        var formatProvider = CultureInfo.InvariantCulture;

        if (TryGetStringSetting(settings, "_formatProvider", out string rawFormatProvider))
        {
            // An unknown culture name is a configuration error and is allowed to throw.
            formatProvider = new CultureInfo(rawFormatProvider);
        }

        var dateStyle = DateTimeStyles.None;

        if (TryGetStringSetting(settings, "_dateStyle", out string rawDateStyle))
        {
            // An unknown style name is a configuration error and is allowed to throw.
            dateStyle = (DateTimeStyles)Enum.Parse(typeof(DateTimeStyles), rawDateStyle);
        }

        DateTime date;

        // With an explicit format only an exact match is accepted; there is no
        // fallback to free-form parsing when the exact parse fails.
        var parsed = format != null
            ? DateTime.TryParseExact(rawDate, format, formatProvider, dateStyle, out date)
            : DateTime.TryParse(rawDate, formatProvider, dateStyle, out date);

        if (parsed)
        {
            return date;
        }

        return null;
    }
}
|
||||||
|
}
|
|
@ -0,0 +1,117 @@
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
// <copyright file="RegexTransformation.cs" company="Microsoft">
|
||||||
|
// Copyright (c) Microsoft. All rights reserved.
|
||||||
|
// </copyright>
|
||||||
|
// -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
namespace OpenScraping.Transformations
|
||||||
|
{
|
||||||
|
using Newtonsoft.Json.Linq;
|
||||||
|
using System;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
|
||||||
|
/// <summary>
/// Transformation that applies a regular expression to the inner text of an HTML node
/// and returns the captured groups.
/// </summary>
/// <remarks>
/// Settings:
/// <list type="bullet">
/// <item><description><c>_regex</c> (required): the pattern.</description></item>
/// <item><description><c>_forceArray</c> (optional, default false): when true, the result is always an array of match objects and every group value is an array of captures.</description></item>
/// <item><description><c>_regexOption</c> (optional): a single <see cref="RegexOptions"/> name given as a JSON string.</description></item>
/// <item><description><c>_regexOptions</c> (optional): a JSON array of <see cref="RegexOptions"/> names, combined with any <c>_regexOption</c>.</description></item>
/// </list>
/// Result shape when <c>_forceArray</c> is false: a plain string for exactly one match with
/// exactly one capturing group; a single object keyed by group name for exactly one match
/// otherwise; an array of such objects for zero or several matches.
/// </remarks>
public class RegexTransformation : ITransformationFromHtml
{
    /// <summary>
    /// Runs the configured regular expression against the node's inner text.
    /// </summary>
    /// <param name="settings">Transformation settings; must contain <c>_regex</c> when the node has text.</param>
    /// <param name="node">Node whose inner text is matched; null yields null.</param>
    /// <param name="logicalParents">Not used by this transformation.</param>
    /// <returns>
    /// Null when the node is null or its text is empty/whitespace; otherwise a string,
    /// a <see cref="JObject"/> or a <see cref="JArray"/> as described on the class.
    /// </returns>
    /// <exception cref="ArgumentException">The <c>_regex</c> setting is missing (including when <paramref name="settings"/> is null).</exception>
    public object Transform(Dictionary<string, object> settings, HtmlAgilityPack.HtmlNode node, List<HtmlAgilityPack.HtmlNode> logicalParents)
    {
        if (node == null)
        {
            return null;
        }

        var text = node.InnerText;

        if (string.IsNullOrWhiteSpace(text))
        {
            return null;
        }

        // A null settings dictionary or a null pattern value is reported the same way as a
        // missing key, instead of surfacing as a NullReferenceException further down.
        if (settings == null
            || !settings.TryGetValue("_regex", out object regexPatternObj)
            || regexPatternObj == null)
        {
            throw new ArgumentException("Could not find a _regex setting");
        }

        var forceArray = false;

        if (settings.TryGetValue("_forceArray", out object forceArrayObj) && forceArrayObj != null)
        {
            forceArray = bool.Parse(forceArrayObj.ToString());
        }

        var regex = new Regex(regexPatternObj.ToString(), ReadRegexOptions(settings));
        var matches = regex.Matches(text);

        // Simplest case: one match with a single capturing group collapses to its value.
        // Groups[0] is the whole match, so a count of 2 means exactly one capturing group.
        if (!forceArray && matches.Count == 1 && matches[0].Groups.Count == 2)
        {
            return matches[0].Groups[1].Value;
        }

        var returnedMatches = new List<JObject>();

        foreach (Match match in matches)
        {
            returnedMatches.Add(ConvertMatch(regex, match, forceArray));
        }

        if (!forceArray && returnedMatches.Count == 1)
        {
            return returnedMatches[0];
        }

        return new JArray(returnedMatches);
    }

    /// <summary>
    /// Combines the <c>_regexOption</c> (single string) and <c>_regexOptions</c> (array)
    /// settings into one <see cref="RegexOptions"/> value. Settings that are absent, null,
    /// or not of the expected JSON type are ignored; an unknown option name is a
    /// configuration error and is allowed to throw from <see cref="Enum.Parse(Type, string)"/>.
    /// </summary>
    private static RegexOptions ReadRegexOptions(Dictionary<string, object> settings)
    {
        var regexOptions = RegexOptions.None;

        if (settings.TryGetValue("_regexOption", out object singleObj)
            && singleObj is JToken singleToken
            && singleToken.Type == JTokenType.String)
        {
            regexOptions = (RegexOptions)Enum.Parse(typeof(RegexOptions), singleToken.ToString());
        }

        if (settings.TryGetValue("_regexOptions", out object multiObj)
            && multiObj is JArray rawOptions)
        {
            foreach (var rawOption in rawOptions)
            {
                regexOptions |= (RegexOptions)Enum.Parse(typeof(RegexOptions), (string)rawOption);
            }
        }

        return regexOptions;
    }

    /// <summary>
    /// Converts one match into an object keyed by group name (or group number, as text,
    /// for unnamed groups). A group value is a plain string when arrays are not forced and
    /// the group has exactly one capture; otherwise it is an array of all its captures.
    /// </summary>
    private static JObject ConvertMatch(Regex regex, Match match, bool forceArray)
    {
        var returnedMatch = new JObject();

        // Start at 1: group 0 is the entire match and is not reported.
        for (var i = 1; i < match.Groups.Count; i++)
        {
            var group = match.Groups[i];
            var groupName = regex.GroupNameFromNumber(i);

            if (!forceArray && group.Captures.Count == 1)
            {
                returnedMatch[groupName] = group.Value;
                continue;
            }

            var captures = new List<string>();

            foreach (Capture capture in group.Captures)
            {
                captures.Add(capture.Value);
            }

            returnedMatch[groupName] = new JArray(captures);
        }

        return returnedMatch;
    }
}
|
||||||
|
}
|
|
@ -24,12 +24,12 @@ namespace OpenScraping.Transformations
|
||||||
|
|
||||||
if (!string.IsNullOrWhiteSpace(text))
|
if (!string.IsNullOrWhiteSpace(text))
|
||||||
{
|
{
|
||||||
if (settings != null && settings["_separator"] != null && ((JValue)settings["_separator"]).Type == JTokenType.String)
|
if (settings != null && settings.ContainsKey("_separator") && ((JValue)settings["_separator"]).Type == JTokenType.String)
|
||||||
{
|
{
|
||||||
separator = settings["_separator"].ToString();
|
separator = settings["_separator"].ToString();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (settings != null && settings["_trim"] != null && ((JValue)settings["_trim"]).Type == JTokenType.Boolean)
|
if (settings != null && settings.ContainsKey("_trim") && ((JValue)settings["_trim"]).Type == JTokenType.Boolean)
|
||||||
{
|
{
|
||||||
trim = (bool)((JValue)settings["_trim"]).Value;
|
trim = (bool)((JValue)settings["_trim"]).Value;
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,7 +26,7 @@ namespace OpenScraping.Transformations
|
||||||
var grandParentNode = logicalParents[logicalParents.Count - 2];
|
var grandParentNode = logicalParents[logicalParents.Count - 2];
|
||||||
HtmlAgilityPack.HtmlNode parentNode = grandParentNode;
|
HtmlAgilityPack.HtmlNode parentNode = grandParentNode;
|
||||||
|
|
||||||
if (settings != null && settings["_startingXPath"] != null && ((JValue)settings["_startingXPath"]).Type == JTokenType.String)
|
if (settings != null && settings.ContainsKey("_startingXPath") && ((JValue)settings["_startingXPath"]).Type == JTokenType.String)
|
||||||
{
|
{
|
||||||
var startingXPath = ((JValue)settings["_startingXPath"]).ToObject<string>();
|
var startingXPath = ((JValue)settings["_startingXPath"]).ToObject<string>();
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче