diff --git a/OpenScraping.Tests/OpenScraping.Tests.csproj b/OpenScraping.Tests/OpenScraping.Tests.csproj
index fd88b21..0d04b20 100644
--- a/OpenScraping.Tests/OpenScraping.Tests.csproj
+++ b/OpenScraping.Tests/OpenScraping.Tests.csproj
@@ -23,6 +23,18 @@
PreserveNewest
+
+ PreserveNewest
+
+
+ PreserveNewest
+
+
+ PreserveNewest
+
+
+ PreserveNewest
+
PreserveNewest
diff --git a/OpenScraping.Tests/StructuredDataExtractionTests.cs b/OpenScraping.Tests/StructuredDataExtractionTests.cs
index df33f5a..d0ef14a 100644
--- a/OpenScraping.Tests/StructuredDataExtractionTests.cs
+++ b/OpenScraping.Tests/StructuredDataExtractionTests.cs
@@ -6,13 +6,14 @@
namespace Microsoft.Search.StructuredDataExtraction.Tests
{
- using System.Globalization;
- using System.IO;
- using OpenScraping.Config;
- using OpenScraping;
+ using Microsoft.VisualStudio.TestTools.UnitTesting;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
- using Microsoft.VisualStudio.TestTools.UnitTesting;
+ using OpenScraping;
+ using OpenScraping.Config;
+ using System;
+ using System.Globalization;
+ using System.IO;
[TestClass]
public class StructuredDataExtractionTests
@@ -437,5 +438,37 @@ namespace Microsoft.Search.StructuredDataExtraction.Tests
Assert.AreEqual("Para1 content Para2 content", parsedJson["body"].Value, "The extracted body is incorrect");
}
+
+        [TestMethod]
+        public void RegexTest()
+        {
+            // Extract with the regex rules and normalise the result by round-tripping it through JSON.
+            var rules = StructuredDataConfig.ParseJsonFile(Path.Combine("TestData", "regex_rules.json"));
+            var extractor = new StructuredDataExtractor(rules);
+            var html = File.ReadAllText(Path.Combine("TestData", "article_with_date.html"));
+            var actual = JObject.Parse(JsonConvert.SerializeObject(extractor.Extract(html), Formatting.Indented));
+
+            // The expected document lives alongside the rules in TestData.
+            var expected = JObject.Parse(File.ReadAllText(Path.Combine("TestData", "regex_expected_result.json")));
+
+            // Compare token trees rather than raw text.
+            var sameTree = JToken.DeepEquals(actual, expected);
+            Assert.IsTrue(sameTree);
+        }
+
+        [TestMethod]
+        public void ParseDateTest()
+        {
+            // Expected values are built with the DateTime constructor rather than DateTime.Parse,
+            // so they do not depend on the culture of the machine running the test.
+            var configPath = Path.Combine("TestData", "parse_date_rules.json");
+            var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonFile(configPath));
+            var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_date.html")));
+            dynamic parsedJson = JsonConvert.DeserializeObject(JsonConvert.SerializeObject(result, Formatting.Indented));
+
+            Assert.AreEqual(new DateTime(2018, 11, 24), parsedJson["parsedDateNoFormat"].Value);
+            Assert.AreEqual(new DateTime(2011, 12, 30), parsedJson["parsedDateWithFormat"].Value);
+            Assert.AreEqual(new DateTime(2008, 6, 12), parsedJson["parsedDateNoFormatWithProviderStyle"].Value);
+        }
}
}
diff --git a/OpenScraping.Tests/TestData/article_with_date.html b/OpenScraping.Tests/TestData/article_with_date.html
new file mode 100644
index 0000000..835323e
--- /dev/null
+++ b/OpenScraping.Tests/TestData/article_with_date.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+    <title>Page title</title>
+
+
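+    <p id="published-timestamp1">2018-11-24</p>
+    <p id="published-timestamp2">Published: 12-30-11</p>
+    <p id="published-timestamp3">Published: 12 Juni 2008</p>
+    <div class="info">Contact information. Phone: 111-111-111, Address: str.Street 1/1, City. 2017</div>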
+
+
\ No newline at end of file
diff --git a/OpenScraping.Tests/TestData/parse_date_rules.json b/OpenScraping.Tests/TestData/parse_date_rules.json
new file mode 100644
index 0000000..2744fe0
--- /dev/null
+++ b/OpenScraping.Tests/TestData/parse_date_rules.json
@@ -0,0 +1,33 @@
+{
+ "parsedDateNoFormat": {
+ "_xpath": "//p[@id='published-timestamp1']",
+ "_transformation": "ParseDateTransformation"
+ },
+ "parsedDateWithFormat": {
+ "_xpath": "//p[@id='published-timestamp2']",
+ "_transformations": [
+ {
+ "_type": "RegexTransformation",
+ "_regex": "Published: (.*)"
+ },
+ {
+ "_type": "ParseDateTransformation",
+ "_format": "MM-dd-yy"
+ }
+ ]
+ },
+ "parsedDateNoFormatWithProviderStyle": {
+ "_xpath": "//p[@id='published-timestamp3']",
+ "_transformations": [
+ {
+ "_type": "RegexTransformation",
+ "_regex": "Published: (.*)"
+ },
+ {
+ "_type": "ParseDateTransformation",
+ "_formatProvider": "de-DE",
+ "_dateStyle": "None"
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/OpenScraping.Tests/TestData/regex_expected_result.json b/OpenScraping.Tests/TestData/regex_expected_result.json
new file mode 100644
index 0000000..567ada0
--- /dev/null
+++ b/OpenScraping.Tests/TestData/regex_expected_result.json
@@ -0,0 +1,54 @@
+{
+ "publishedDTNoGroupNameForceArrayFalse": "12-30-11",
+ "publishedDTWithGroupNameForceArrayFalse": "12-30-11",
+ "multiMatchNoGroupNameForceArrayFalse": {
+ "1": "111-111-111",
+ "2": "str.Street 1/1, City. 2017"
+ },
+ "multiMatchWithGroupNameForceArrayFalse": {
+ "phone": "111-111-111",
+ "address": "str.Street 1/1, City. 2017"
+ },
+ "publishedDTNoGroupNameForceArrayTrue": [
+ {
+ "1": [
+ "12 Juni 2008"
+ ]
+ }
+ ],
+ "publishedDTWithGroupNameForceArrayTrue": [
+ {
+ "date": [
+ "12 Juni 2008"
+ ]
+ }
+ ],
+ "multiMatchNoGroupNameForceArrayTrue": [
+ {
+ "1": [
+ "111-111-111"
+ ],
+ "2": [
+ "str.Street 1/1, City. 2017"
+ ]
+ }
+ ],
+ "multiMatchWithGroupNameForceArrayTrue": [
+ {
+ "phone": [
+ "111-111-111"
+ ],
+ "address": [
+ "str.Street 1/1, City. 2017"
+ ]
+ }
+ ],
+ "multiMatchWithSameGroupNameForceArrayTrue": [
+ {
+ "same_group": [
+ "111-111-111",
+ "str.Street 1/1, City. 2017"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/OpenScraping.Tests/TestData/regex_rules.json b/OpenScraping.Tests/TestData/regex_rules.json
new file mode 100644
index 0000000..e7091a7
--- /dev/null
+++ b/OpenScraping.Tests/TestData/regex_rules.json
@@ -0,0 +1,90 @@
+{
+ "publishedDTNoGroupNameForceArrayFalse": {
+ "_xpath": "//p[@id='published-timestamp2']",
+ "_transformations": [
+ {
+ "_type": "RegexTransformation",
+ "_regex": "Published: (.*)"
+ }
+ ]
+ },
+ "publishedDTWithGroupNameForceArrayFalse": {
+ "_xpath": "//p[@id='published-timestamp2']",
+ "_transformations": [
+ {
+ "_type": "RegexTransformation",
+ "_regex": "Published: (?<date>.*)",
+ "_regexOption": "IgnoreCase"
+ }
+ ]
+ },
+ "multiMatchNoGroupNameForceArrayFalse": {
+ "_xpath": "//div[@class='info']",
+ "_transformations": [
+ {
+ "_type": "RegexTransformation",
+ "_regex": "^Contact information\\. Phone: ([0-9-]+), Address: (.*)$",
+ "_regexOptions": [ "IgnoreCase", "CultureInvariant" ]
+ }
+ ]
+ },
+ "multiMatchWithGroupNameForceArrayFalse": {
+ "_xpath": "//div[@class='info']",
+ "_transformations": [
+ {
+ "_type": "RegexTransformation",
+ "_regex": "^Contact information\\. Phone: (?<phone>[0-9-]+), Address: (?<address>.*)$"
+ }
+ ]
+ },
+ "publishedDTNoGroupNameForceArrayTrue": {
+ "_xpath": "//p[@id='published-timestamp3']",
+ "_transformations": [
+ {
+ "_type": "RegexTransformation",
+ "_regex": "Published: (.*)",
+ "_forceArray": true
+ }
+ ]
+ },
+ "publishedDTWithGroupNameForceArrayTrue": {
+ "_xpath": "//p[@id='published-timestamp3']",
+ "_transformations": [
+ {
+ "_type": "RegexTransformation",
+ "_regex": "Published: (?<date>.*)",
+ "_forceArray": true
+ }
+ ]
+ },
+ "multiMatchNoGroupNameForceArrayTrue": {
+ "_xpath": "//div[@class='info']",
+ "_transformations": [
+ {
+ "_type": "RegexTransformation",
+ "_regex": "^Contact information\\. Phone: ([0-9-]+), Address: (.*)$",
+ "_forceArray": true
+ }
+ ]
+ },
+ "multiMatchWithGroupNameForceArrayTrue": {
+ "_xpath": "//div[@class='info']",
+ "_transformations": [
+ {
+ "_type": "RegexTransformation",
+ "_regex": "^Contact information\\. Phone: (?<phone>[0-9-]+), Address: (?<address>.*)$",
+ "_forceArray": true
+ }
+ ]
+ },
+ "multiMatchWithSameGroupNameForceArrayTrue": {
+ "_xpath": "//div[@class='info']",
+ "_transformations": [
+ {
+ "_type": "RegexTransformation",
+ "_regex": "^Contact information\\. Phone: (?<same_group>[0-9-]+), Address: (?<same_group>.*)$",
+ "_forceArray": true
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/OpenScraping.sln b/OpenScraping.sln
index 2388450..c9defcf 100644
--- a/OpenScraping.sln
+++ b/OpenScraping.sln
@@ -5,7 +5,7 @@ VisualStudioVersion = 15.0.26730.8
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenScraping", "OpenScraping\OpenScraping.csproj", "{C38AA240-58C8-4081-BFA1-2818DE64A583}"
EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenScraping.Tests", "OpenScraping.Tests\OpenScraping.Tests.csproj", "{AB0BE337-12A9-4807-96F5-73854A81C2D8}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "OpenScraping.Tests", "OpenScraping.Tests\OpenScraping.Tests.csproj", "{AB0BE337-12A9-4807-96F5-73854A81C2D8}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
diff --git a/OpenScraping/OpenScraping.csproj b/OpenScraping/OpenScraping.csproj
index 53a75e8..33fd620 100644
--- a/OpenScraping/OpenScraping.csproj
+++ b/OpenScraping/OpenScraping.csproj
@@ -8,13 +8,11 @@
https://github.com/Microsoft/openscraping-lib-csharp
https://github.com/Microsoft/openscraping-lib-csharp
html extraction scraping scraper parser parsing open scraping openscraping
- - Added support for _removeXPaths config key, which allows removing some child nodes based on xPath rules BEFORE we process the main parent node. Useful for example when extracting a news article that contains divs that we want to remove BEFORE we extract the body of an article.
-- Updated some dependencies like HtmlAgilityPack to the latest stable version.
-- Simplified reading configs and added support for specifying some config keys as either singular or plural, and setting their values to either a single value or an array.
- 1.2.0.0
- 1.2.0.0
+ Added support for Regex transformation and ParseDateTransformation
+ 1.3.0.0
+ 1.3.0.0
https://github.com/Microsoft/openscraping-lib-csharp/blob/master/logo.png?raw=true
- 1.2.0
+ 1.3.0
diff --git a/OpenScraping/Transformations/ListTitleTransformation.cs b/OpenScraping/Transformations/ListTitleTransformation.cs
index 0201b76..0f7b240 100644
--- a/OpenScraping/Transformations/ListTitleTransformation.cs
+++ b/OpenScraping/Transformations/ListTitleTransformation.cs
@@ -41,12 +41,12 @@ namespace OpenScraping.Transformations
var maxLevel = 3;
var maxTitleLength = 200;
- if (settings != null && settings["_maxStepsUpward"] != null && ((JValue)settings["_maxStepsUpward"]).Type == JTokenType.Integer)
+ if (settings != null && settings.ContainsKey("_maxStepsUpward") && ((JValue)settings["_maxStepsUpward"]).Type == JTokenType.Integer)
{
maxLevel = ((JValue)settings["_maxStepsUpward"]).ToObject();
}
- if (settings != null && settings["_maxTitleLength"] != null && ((JValue)settings["_maxTitleLength"]).Type == JTokenType.Integer)
+ if (settings != null && settings.ContainsKey("_maxTitleLength") && ((JValue)settings["_maxTitleLength"]).Type == JTokenType.Integer)
{
maxTitleLength = ((JValue)settings["_maxTitleLength"]).ToObject();
}
diff --git a/OpenScraping/Transformations/ParseDateTransformation.cs b/OpenScraping/Transformations/ParseDateTransformation.cs
new file mode 100644
index 0000000..97949f9
--- /dev/null
+++ b/OpenScraping/Transformations/ParseDateTransformation.cs
@@ -0,0 +1,89 @@
+// -----------------------------------------------------------------------
+//
+// Copyright (c) Microsoft. All rights reserved.
+//
+// -----------------------------------------------------------------------
+
+namespace OpenScraping.Transformations
+{
+    using Newtonsoft.Json.Linq;
+    using System;
+    using System.Collections.Generic;
+    using System.Globalization;
+
+    /// <summary>
+    /// Parses text into a <see cref="DateTime"/>. Optional settings: "_format" (exact format
+    /// string), "_formatProvider" (culture name; the invariant culture is used when absent) and
+    /// "_dateStyle" (name of a <see cref="DateTimeStyles"/> value; None when absent).
+    /// </summary>
+    public class ParseDateTransformation : ITransformationFromObject, ITransformationFromHtml
+    {
+        /// <summary>Parses an already extracted value, for example the result of a preceding transformation.</summary>
+        /// <param name="settings">Transformation settings; may be null.</param>
+        /// <param name="input">Value to parse; anything other than a string yields null.</param>
+        /// <returns>The parsed <see cref="DateTime"/>, or null when parsing fails.</returns>
+        public object Transform(Dictionary<string, object> settings, object input)
+        {
+            var rawDate = input as string;
+            return rawDate != null ? ParseDate(settings, rawDate) : null;
+        }
+
+        /// <summary>Parses the inner text of an HTML node.</summary>
+        /// <param name="settings">Transformation settings; may be null.</param>
+        /// <param name="node">Node whose inner text holds the date; null yields null.</param>
+        /// <param name="logicalParents">Not used by this transformation.</param>
+        /// <returns>The parsed <see cref="DateTime"/>, or null when parsing fails.</returns>
+        public object Transform(Dictionary<string, object> settings, HtmlAgilityPack.HtmlNode node, List<HtmlAgilityPack.HtmlNode> logicalParents)
+        {
+            return node != null ? ParseDate(settings, node.InnerText) : null;
+        }
+
+        /// <summary>Applies the configured format, culture and style to <paramref name="rawDate"/>.</summary>
+        private static object ParseDate(Dictionary<string, object> settings, string rawDate)
+        {
+            var formatProvider = CultureInfo.InvariantCulture;
+            var dateStyle = DateTimeStyles.None;
+
+            if (TryGetString(settings, "_formatProvider", out var cultureName))
+            {
+                formatProvider = new CultureInfo(cultureName);
+            }
+
+            if (TryGetString(settings, "_dateStyle", out var styleName))
+            {
+                dateStyle = (DateTimeStyles)Enum.Parse(typeof(DateTimeStyles), styleName);
+            }
+
+            // An explicit format switches to exact parsing; otherwise the provider decides.
+            DateTime date;
+            var parsed = TryGetString(settings, "_format", out var format)
+                ? DateTime.TryParseExact(rawDate, format, formatProvider, dateStyle, out date)
+                : DateTime.TryParse(rawDate, formatProvider, dateStyle, out date);
+
+            return parsed ? (object)date : null;
+        }
+
+        /// <summary>
+        /// Reads a string-valued setting. Missing keys, null values and values that are not
+        /// JSON strings are treated as "not set" instead of failing on a cast.
+        /// </summary>
+        private static bool TryGetString(Dictionary<string, object> settings, string key, out string value)
+        {
+            value = null;
+
+            if (settings == null || !settings.TryGetValue(key, out var raw))
+            {
+                return false;
+            }
+
+            var token = raw as JValue;
+            if (token == null || token.Type != JTokenType.String)
+            {
+                return false;
+            }
+
+            value = token.ToString();
+            return true;
+        }
+    }
+}
diff --git a/OpenScraping/Transformations/RegexTransformation.cs b/OpenScraping/Transformations/RegexTransformation.cs
new file mode 100644
index 0000000..c46c059
--- /dev/null
+++ b/OpenScraping/Transformations/RegexTransformation.cs
@@ -0,0 +1,128 @@
+// -----------------------------------------------------------------------
+//
+// Copyright (c) Microsoft. All rights reserved.
+//
+// -----------------------------------------------------------------------
+
+namespace OpenScraping.Transformations
+{
+    using Newtonsoft.Json.Linq;
+    using System;
+    using System.Linq;
+    using System.Collections.Generic;
+    using System.Text.RegularExpressions;
+
+    /// <summary>
+    /// Applies a regular expression to the inner text of a node and returns the captured groups.
+    /// Settings: "_regex" (required pattern), "_forceArray" (optional boolean), "_regexOption"
+    /// (optional <see cref="RegexOptions"/> name) and "_regexOptions" (optional array of names).
+    /// </summary>
+    public class RegexTransformation : ITransformationFromHtml
+    {
+        /// <summary>Runs the configured pattern against the inner text of <paramref name="node"/>.</summary>
+        /// <param name="settings">Transformation settings; "_regex" is required when the node has text.</param>
+        /// <param name="node">Node whose inner text is matched; null yields null.</param>
+        /// <param name="logicalParents">Not used by this transformation.</param>
+        /// <returns>
+        /// Null when the node is null or its text is blank. Without "_forceArray": the captured text
+        /// for a single match of a single-group pattern, a JObject keyed by group name for any other
+        /// single match, and a JArray of such objects otherwise (empty when nothing matches). With
+        /// "_forceArray": always a JArray of objects whose values are arrays of captures.
+        /// </returns>
+        /// <exception cref="ArgumentException">No "_regex" setting is available.</exception>
+        public object Transform(Dictionary<string, object> settings, HtmlAgilityPack.HtmlNode node, List<HtmlAgilityPack.HtmlNode> logicalParents)
+        {
+            var text = node?.InnerText;
+
+            if (string.IsNullOrWhiteSpace(text))
+            {
+                return null;
+            }
+
+            object regexPatternObj = null;
+
+            // Missing settings are reported like a missing pattern rather than as a NullReferenceException.
+            if (settings == null || !settings.TryGetValue("_regex", out regexPatternObj) || regexPatternObj == null)
+            {
+                throw new ArgumentException("Could not find a _regex setting");
+            }
+
+            var forceArray = settings.TryGetValue("_forceArray", out var forceArrayObj)
+                && forceArrayObj != null
+                && bool.Parse(forceArrayObj.ToString());
+
+            var regex = new Regex(regexPatternObj.ToString(), ReadRegexOptions(settings));
+            var matches = regex.Matches(text);
+
+            // Use the pattern's real capturing-group numbers (0 is the whole match): they need not be
+            // contiguous, e.g. when the pattern contains explicitly numbered groups.
+            var groupNumbers = regex.GetGroupNumbers().Where(number => number != 0).ToArray();
+
+            if (!forceArray && matches.Count == 1 && groupNumbers.Length == 1)
+            {
+                return matches[0].Groups[groupNumbers[0]].Value;
+            }
+
+            var returnedMatches = new List<JObject>();
+
+            foreach (Match match in matches)
+            {
+                returnedMatches.Add(BuildMatchObject(regex, match, groupNumbers, forceArray));
+            }
+
+            if (!forceArray && returnedMatches.Count == 1)
+            {
+                return returnedMatches[0];
+            }
+
+            return new JArray(returnedMatches);
+        }
+
+        /// <summary>Combines the "_regexOption" and "_regexOptions" settings into one flags value.</summary>
+        private static RegexOptions ReadRegexOptions(Dictionary<string, object> settings)
+        {
+            var regexOptions = RegexOptions.None;
+
+            // Values of an unexpected type are ignored instead of failing on a cast.
+            if (settings.TryGetValue("_regexOption", out var single) && single is JToken token && token.Type == JTokenType.String)
+            {
+                regexOptions = (RegexOptions)Enum.Parse(typeof(RegexOptions), token.ToString());
+            }
+
+            if (settings.TryGetValue("_regexOptions", out var many) && many is JArray rawOptions)
+            {
+                foreach (var rawOption in rawOptions)
+                {
+                    regexOptions |= (RegexOptions)Enum.Parse(typeof(RegexOptions), (string)rawOption);
+                }
+            }
+
+            return regexOptions;
+        }
+
+        /// <summary>Converts one match into an object keyed by group name.</summary>
+        private static JObject BuildMatchObject(Regex regex, Match match, int[] groupNumbers, bool forceArray)
+        {
+            var returnedMatch = new JObject();
+
+            foreach (var number in groupNumbers)
+            {
+                var group = match.Groups[number];
+                var groupName = regex.GroupNameFromNumber(number);
+
+                if (!forceArray && group.Captures.Count == 1)
+                {
+                    returnedMatch[groupName] = group.Value;
+                }
+                else
+                {
+                    // Keep every capture of the group, not only the last one.
+                    var captures = group.Captures.Cast<Capture>().Select(capture => capture.Value).ToList();
+                    returnedMatch[groupName] = new JArray(captures);
+                }
+            }
+
+            return returnedMatch;
+        }
+    }
+}
diff --git a/OpenScraping/Transformations/SplitTransformation.cs b/OpenScraping/Transformations/SplitTransformation.cs
index 8e7b309..98bcfc1 100644
--- a/OpenScraping/Transformations/SplitTransformation.cs
+++ b/OpenScraping/Transformations/SplitTransformation.cs
@@ -24,12 +24,12 @@ namespace OpenScraping.Transformations
if (!string.IsNullOrWhiteSpace(text))
{
- if (settings != null && settings["_separator"] != null && ((JValue)settings["_separator"]).Type == JTokenType.String)
+ if (settings != null && settings.ContainsKey("_separator") && ((JValue)settings["_separator"]).Type == JTokenType.String)
{
separator = settings["_separator"].ToString();
}
- if (settings != null && settings["_trim"] != null && ((JValue)settings["_trim"]).Type == JTokenType.Boolean)
+ if (settings != null && settings.ContainsKey("_trim") && ((JValue)settings["_trim"]).Type == JTokenType.Boolean)
{
trim = (bool)((JValue)settings["_trim"]).Value;
}
diff --git a/OpenScraping/Transformations/TotalTextLengthAboveListTransformation.cs b/OpenScraping/Transformations/TotalTextLengthAboveListTransformation.cs
index f6baed8..29fe877 100644
--- a/OpenScraping/Transformations/TotalTextLengthAboveListTransformation.cs
+++ b/OpenScraping/Transformations/TotalTextLengthAboveListTransformation.cs
@@ -26,7 +26,7 @@ namespace OpenScraping.Transformations
var grandParentNode = logicalParents[logicalParents.Count - 2];
HtmlAgilityPack.HtmlNode parentNode = grandParentNode;
- if (settings != null && settings["_startingXPath"] != null && ((JValue)settings["_startingXPath"]).Type == JTokenType.String)
+ if (settings != null && settings.ContainsKey("_startingXPath") && ((JValue)settings["_startingXPath"]).Type == JTokenType.String)
{
var startingXPath = ((JValue)settings["_startingXPath"]).ToObject();