diff --git a/OpenScraping.Tests/OpenScraping.Tests.csproj b/OpenScraping.Tests/OpenScraping.Tests.csproj index fd88b21..0d04b20 100644 --- a/OpenScraping.Tests/OpenScraping.Tests.csproj +++ b/OpenScraping.Tests/OpenScraping.Tests.csproj @@ -23,6 +23,18 @@ PreserveNewest + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + PreserveNewest diff --git a/OpenScraping.Tests/StructuredDataExtractionTests.cs b/OpenScraping.Tests/StructuredDataExtractionTests.cs index df33f5a..d0ef14a 100644 --- a/OpenScraping.Tests/StructuredDataExtractionTests.cs +++ b/OpenScraping.Tests/StructuredDataExtractionTests.cs @@ -6,13 +6,14 @@ namespace Microsoft.Search.StructuredDataExtraction.Tests { - using System.Globalization; - using System.IO; - using OpenScraping.Config; - using OpenScraping; + using Microsoft.VisualStudio.TestTools.UnitTesting; using Newtonsoft.Json; using Newtonsoft.Json.Linq; - using Microsoft.VisualStudio.TestTools.UnitTesting; + using OpenScraping; + using OpenScraping.Config; + using System; + using System.Globalization; + using System.IO; [TestClass] public class StructuredDataExtractionTests @@ -437,5 +438,37 @@ namespace Microsoft.Search.StructuredDataExtraction.Tests Assert.AreEqual("Para1 content Para2 content", parsedJson["body"].Value, "The extracted body is incorrect"); } + + [TestMethod] + public void RegexTest() + { + var configPath = Path.Combine("TestData", "regex_rules.json"); + var config = StructuredDataConfig.ParseJsonFile(configPath); + var extractor = new StructuredDataExtractor(config); + var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_date.html"))); + var actualJson = JsonConvert.SerializeObject(result, Formatting.Indented); + var parsedActualJson = JObject.Parse(actualJson); + + var expectedJsonPath = Path.Combine("TestData", "regex_expected_result.json"); + var expectedJson = File.ReadAllText(expectedJsonPath); + var parsedExpectedJson = JObject.Parse(expectedJson); + + Assert.IsTrue(JToken.DeepEquals(parsedActualJson, parsedExpectedJson)); + } + + [TestMethod] + public void ParseDateTest() + { + var configPath = Path.Combine("TestData", "parse_date_rules.json"); + var config = StructuredDataConfig.ParseJsonFile(configPath); + var extractor = new StructuredDataExtractor(config); + var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_date.html"))); + var json = JsonConvert.SerializeObject(result, Formatting.Indented); + dynamic parsedJson = JsonConvert.DeserializeObject(json); + + Assert.AreEqual(DateTime.Parse("2018-11-24T00:00:00"), parsedJson["parsedDateNoFormat"].Value); + Assert.AreEqual(DateTime.Parse("2011-12-30T00:00:00"), parsedJson["parsedDateWithFormat"].Value); + Assert.AreEqual(DateTime.Parse("2008-06-12T00:00:00"), parsedJson["parsedDateNoFormatWithProviderStyle"].Value); + } } } diff --git a/OpenScraping.Tests/TestData/article_with_date.html b/OpenScraping.Tests/TestData/article_with_date.html new file mode 100644 index 0000000..835323e --- /dev/null +++ b/OpenScraping.Tests/TestData/article_with_date.html @@ -0,0 +1,14 @@ + + + + + + Page title + + +

+    <p id="published-timestamp1">2018-11-24</p>
+    <p id="published-timestamp2">Published: 12-30-11</p>
+    <p id="published-timestamp3">Published: 12 Juni 2008</p>
+    <div class="info">
+        Contact information. Phone: 111-111-111, Address: str.Street 1/1, City. 2017
+    </div>
+ + \ No newline at end of file diff --git a/OpenScraping.Tests/TestData/parse_date_rules.json b/OpenScraping.Tests/TestData/parse_date_rules.json new file mode 100644 index 0000000..2744fe0 --- /dev/null +++ b/OpenScraping.Tests/TestData/parse_date_rules.json @@ -0,0 +1,33 @@ +{ + "parsedDateNoFormat": { + "_xpath": "//p[@id='published-timestamp1']", + "_transformation": "ParseDateTransformation" + }, + "parsedDateWithFormat": { + "_xpath": "//p[@id='published-timestamp2']", + "_transformations": [ + { + "_type": "RegexTransformation", + "_regex": "Published: (.*)" + }, + { + "_type": "ParseDateTransformation", + "_format": "MM-dd-yy" + } + ] + }, + "parsedDateNoFormatWithProviderStyle": { + "_xpath": "//p[@id='published-timestamp3']", + "_transformations": [ + { + "_type": "RegexTransformation", + "_regex": "Published: (.*)" + }, + { + "_type": "ParseDateTransformation", + "_formatProvider": "de-DE", + "_dateStyle": "None" + } + ] + } +} \ No newline at end of file diff --git a/OpenScraping.Tests/TestData/regex_expected_result.json b/OpenScraping.Tests/TestData/regex_expected_result.json new file mode 100644 index 0000000..567ada0 --- /dev/null +++ b/OpenScraping.Tests/TestData/regex_expected_result.json @@ -0,0 +1,54 @@ +{ + "publishedDTNoGroupNameForceArrayFalse": "12-30-11", + "publishedDTWithGroupNameForceArrayFalse": "12-30-11", + "multiMatchNoGroupNameForceArrayFalse": { + "1": "111-111-111", + "2": "str.Street 1/1, City. 2017" + }, + "multiMatchWithGroupNameForceArrayFalse": { + "phone": "111-111-111", + "address": "str.Street 1/1, City. 2017" + }, + "publishedDTNoGroupNameForceArrayTrue": [ + { + "1": [ + "12 Juni 2008" + ] + } + ], + "publishedDTWithGroupNameForceArrayTrue": [ + { + "date": [ + "12 Juni 2008" + ] + } + ], + "multiMatchNoGroupNameForceArrayTrue": [ + { + "1": [ + "111-111-111" + ], + "2": [ + "str.Street 1/1, City. 2017" + ] + } + ], + "multiMatchWithGroupNameForceArrayTrue": [ + { + "phone": [ + "111-111-111" + ], + "address": [ + "str.Street 1/1, City. 2017" + ] + } + ], + "multiMatchWithSameGroupNameForceArrayTrue": [ + { + "same_group": [ + "111-111-111", + "str.Street 1/1, City. 2017" + ] + } + ] +} \ No newline at end of file diff --git a/OpenScraping.Tests/TestData/regex_rules.json b/OpenScraping.Tests/TestData/regex_rules.json new file mode 100644 index 0000000..e7091a7 --- /dev/null +++ b/OpenScraping.Tests/TestData/regex_rules.json @@ -0,0 +1,90 @@ +{ + "publishedDTNoGroupNameForceArrayFalse": { + "_xpath": "//p[@id='published-timestamp2']", + "_transformations": [ + { + "_type": "RegexTransformation", + "_regex": "Published: (.*)" + } + ] + }, + "publishedDTWithGroupNameForceArrayFalse": { + "_xpath": "//p[@id='published-timestamp2']", + "_transformations": [ + { + "_type": "RegexTransformation", + "_regex": "Published: (?.*)", + "_regexOption": "IgnoreCase" + } + ] + }, + "multiMatchNoGroupNameForceArrayFalse": { + "_xpath": "//div[@class='info']", + "_transformations": [ + { + "_type": "RegexTransformation", + "_regex": "^Contact information\\. Phone: ([0-9-]+), Address: (.*)$", + "_regexOptions": [ "IgnoreCase", "CultureInvariant" ] + } + ] + }, + "multiMatchWithGroupNameForceArrayFalse": { + "_xpath": "//div[@class='info']", + "_transformations": [ + { + "_type": "RegexTransformation", + "_regex": "^Contact information\\. Phone: (?[0-9-]+), Address: (?
.*)$" + } + ] + }, + "publishedDTNoGroupNameForceArrayTrue": { + "_xpath": "//p[@id='published-timestamp3']", + "_transformations": [ + { + "_type": "RegexTransformation", + "_regex": "Published: (.*)", + "_forceArray": true + } + ] + }, + "publishedDTWithGroupNameForceArrayTrue": { + "_xpath": "//p[@id='published-timestamp3']", + "_transformations": [ + { + "_type": "RegexTransformation", + "_regex": "Published: (?.*)", + "_forceArray": true + } + ] + }, + "multiMatchNoGroupNameForceArrayTrue": { + "_xpath": "//div[@class='info']", + "_transformations": [ + { + "_type": "RegexTransformation", + "_regex": "^Contact information\\. Phone: ([0-9-]+), Address: (.*)$", + "_forceArray": true + } + ] + }, + "multiMatchWithGroupNameForceArrayTrue": { + "_xpath": "//div[@class='info']", + "_transformations": [ + { + "_type": "RegexTransformation", + "_regex": "^Contact information\\. Phone: (?[0-9-]+), Address: (?
.*)$", + "_forceArray": true + } + ] + }, + "multiMatchWithSameGroupNameForceArrayTrue": { + "_xpath": "//div[@class='info']", + "_transformations": [ + { + "_type": "RegexTransformation", + "_regex": "^Contact information\\. Phone: (?[0-9-]+), Address: (?.*)$", + "_forceArray": true + } + ] + } +} \ No newline at end of file diff --git a/OpenScraping.sln b/OpenScraping.sln index 2388450..c9defcf 100644 --- a/OpenScraping.sln +++ b/OpenScraping.sln @@ -5,7 +5,7 @@ VisualStudioVersion = 15.0.26730.8 MinimumVisualStudioVersion = 10.0.40219.1 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenScraping", "OpenScraping\OpenScraping.csproj", "{C38AA240-58C8-4081-BFA1-2818DE64A583}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenScraping.Tests", "OpenScraping.Tests\OpenScraping.Tests.csproj", "{AB0BE337-12A9-4807-96F5-73854A81C2D8}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "OpenScraping.Tests", "OpenScraping.Tests\OpenScraping.Tests.csproj", "{AB0BE337-12A9-4807-96F5-73854A81C2D8}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/OpenScraping/OpenScraping.csproj b/OpenScraping/OpenScraping.csproj index 53a75e8..33fd620 100644 --- a/OpenScraping/OpenScraping.csproj +++ b/OpenScraping/OpenScraping.csproj @@ -8,13 +8,11 @@ https://github.com/Microsoft/openscraping-lib-csharp https://github.com/Microsoft/openscraping-lib-csharp html extraction scraping scraper parser parsing open scraping openscraping - - Added support for _removeXPaths config key, which allows removing some child nodes based on xPath rules BEFORE we process the main parent node. Useful for example when extracting a news article that contains divs that we want to remove BEFORE we extract the body of an article. -- Updated some dependencies like HtmlAgilityPack to the latest stable version. -- Simplified reading configs and added support for specifying some config keys as either singular or plural, and setting their values to either a single value or an array. 
- 1.2.0.0 - 1.2.0.0 + Added support for Regex transformation and ParseDateTransformation + 1.3.0.0 + 1.3.0.0 https://github.com/Microsoft/openscraping-lib-csharp/blob/master/logo.png?raw=true - 1.2.0 + 1.3.0 diff --git a/OpenScraping/Transformations/ListTitleTransformation.cs b/OpenScraping/Transformations/ListTitleTransformation.cs index 0201b76..0f7b240 100644 --- a/OpenScraping/Transformations/ListTitleTransformation.cs +++ b/OpenScraping/Transformations/ListTitleTransformation.cs @@ -41,12 +41,12 @@ namespace OpenScraping.Transformations var maxLevel = 3; var maxTitleLength = 200; - if (settings != null && settings["_maxStepsUpward"] != null && ((JValue)settings["_maxStepsUpward"]).Type == JTokenType.Integer) + if (settings != null && settings.ContainsKey("_maxStepsUpward") && ((JValue)settings["_maxStepsUpward"]).Type == JTokenType.Integer) { maxLevel = ((JValue)settings["_maxStepsUpward"]).ToObject(); } - if (settings != null && settings["_maxTitleLength"] != null && ((JValue)settings["_maxTitleLength"]).Type == JTokenType.Integer) + if (settings != null && settings.ContainsKey("_maxTitleLength") && ((JValue)settings["_maxTitleLength"]).Type == JTokenType.Integer) { maxTitleLength = ((JValue)settings["_maxTitleLength"]).ToObject(); } diff --git a/OpenScraping/Transformations/ParseDateTransformation.cs b/OpenScraping/Transformations/ParseDateTransformation.cs new file mode 100644 index 0000000..97949f9 --- /dev/null +++ b/OpenScraping/Transformations/ParseDateTransformation.cs @@ -0,0 +1,81 @@ +// ----------------------------------------------------------------------- +// +// Copyright (c) Microsoft. All rights reserved. +// +// ----------------------------------------------------------------------- + +namespace OpenScraping.Transformations +{ + using Newtonsoft.Json.Linq; + using System; + using System.Collections.Generic; + using System.Globalization; + + public class ParseDateTransformation : ITransformationFromObject, ITransformationFromHtml + { + public object Transform(Dictionary settings, object input) + { + if (input != null && input is string) + { + var rawDate = (string)input; + return ParseDate(settings, rawDate); + } + + return null; + } + + public object Transform(Dictionary settings, HtmlAgilityPack.HtmlNode node, List logicalParents) + { + if (node != null) + { + var rawDate = node.InnerText; + return ParseDate(settings, rawDate); + } + + return null; + } + + private object ParseDate(Dictionary settings, string rawDate) + { + string format = null; + var formatProvider = CultureInfo.InvariantCulture; + var dateStyle = DateTimeStyles.None; + + if (settings != null) + { + if (settings.ContainsKey("_format") && ((JValue)settings["_format"]).Type == JTokenType.String) + { + format = settings["_format"].ToString(); + } + + if (settings.ContainsKey("_formatProvider") && ((JValue)settings["_formatProvider"]).Type == JTokenType.String) + { + var rawFormatProvider = settings["_formatProvider"].ToString(); + formatProvider = new CultureInfo(rawFormatProvider); + } + + if (settings.ContainsKey("_dateStyle") && ((JValue)settings["_dateStyle"]).Type == JTokenType.String) + { + var rawDateStyle = settings["_dateStyle"].ToString(); + dateStyle = (DateTimeStyles)Enum.Parse(typeof(DateTimeStyles), rawDateStyle); + } + } + + if (format != null) + { + format = settings["_format"].ToString(); + + if (DateTime.TryParseExact(rawDate, format, formatProvider, dateStyle, out DateTime date)) + { + return date; + } + } + else if (DateTime.TryParse(rawDate, formatProvider, dateStyle, out DateTime 
date)) + { + return date; + } + + return null; + } + } +} diff --git a/OpenScraping/Transformations/RegexTransformation.cs b/OpenScraping/Transformations/RegexTransformation.cs new file mode 100644 index 0000000..c46c059 --- /dev/null +++ b/OpenScraping/Transformations/RegexTransformation.cs @@ -0,0 +1,117 @@ +// ----------------------------------------------------------------------- +// +// Copyright (c) Microsoft. All rights reserved. +// +// ----------------------------------------------------------------------- + +namespace OpenScraping.Transformations +{ + using Newtonsoft.Json.Linq; + using System; + using System.Linq; + using System.Collections.Generic; + using System.Text.RegularExpressions; + + public class RegexTransformation : ITransformationFromHtml + { + public object Transform(Dictionary settings, HtmlAgilityPack.HtmlNode node, List logicalParents) + { + if (node != null) + { + var text = node.InnerText; + + if (!string.IsNullOrWhiteSpace(text)) + { + object regexPatternObj; + + if (!settings.TryGetValue("_regex", out regexPatternObj)) + { + throw new ArgumentException("Could not find a _regex setting"); + } + + var forceArray = false; + + object forceArrayObj; + + if (settings.TryGetValue("_forceArray", out forceArrayObj)) + { + forceArray = bool.Parse(forceArrayObj.ToString()); + } + + var regexOptions = RegexOptions.None; + + if (settings.ContainsKey("_regexOption") && ((JToken)settings["_regexOption"]).Type == JTokenType.String) + { + var rawOption = settings["_regexOption"].ToString(); + regexOptions = (RegexOptions)Enum.Parse(typeof(RegexOptions), rawOption); + } + + if (settings.ContainsKey("_regexOptions") && ((JToken)settings["_regexOptions"]).Type == JTokenType.Array) + { + var rawOptions = (JArray)settings["_regexOptions"]; + + foreach (var rawOption in rawOptions) + { + regexOptions = regexOptions | (RegexOptions)Enum.Parse(typeof(RegexOptions), (string)rawOption); + } + } + + var regexPattern = regexPatternObj.ToString(); + + var regex = new Regex(regexPattern, regexOptions); + var matches = regex.Matches(text); + + if (!forceArray + && matches.Count == 1 + && matches[0].Groups.Count == 2) + { + + return matches[0].Groups[1].Value; + } + + var returnedMatches = new List(); + + foreach (Match match in matches) + { + var returnedMatch = new JObject(); + + // Ignore first group + for (var i = 1; i < match.Groups.Count; i++) + { + var group = match.Groups[i]; + var groupName = regex.GroupNameFromNumber(i); + + if (!forceArray && group.Captures.Count == 1) + { + returnedMatch[groupName] = group.Value; + } + else + { + var captures = new List(); + + foreach (var capture in group.Captures) + { + captures.Add(capture.ToString()); + } + + returnedMatch[groupName] = new JArray(captures); + } + } + + returnedMatches.Add(returnedMatch); + } + + if (!forceArray + && returnedMatches.Count == 1) + { + return returnedMatches[0]; + } + + return new JArray(returnedMatches); + } + } + + return null; + } + } +} diff --git a/OpenScraping/Transformations/SplitTransformation.cs b/OpenScraping/Transformations/SplitTransformation.cs index 8e7b309..98bcfc1 100644 --- a/OpenScraping/Transformations/SplitTransformation.cs +++ b/OpenScraping/Transformations/SplitTransformation.cs @@ -24,12 +24,12 @@ namespace OpenScraping.Transformations if (!string.IsNullOrWhiteSpace(text)) { - if (settings != null && settings["_separator"] != null && ((JValue)settings["_separator"]).Type == JTokenType.String) + if (settings != null && settings.ContainsKey("_separator") && 
((JValue)settings["_separator"]).Type == JTokenType.String) { separator = settings["_separator"].ToString(); } - if (settings != null && settings["_trim"] != null && ((JValue)settings["_trim"]).Type == JTokenType.Boolean) + if (settings != null && settings.ContainsKey("_trim") && ((JValue)settings["_trim"]).Type == JTokenType.Boolean) { trim = (bool)((JValue)settings["_trim"]).Value; } diff --git a/OpenScraping/Transformations/TotalTextLengthAboveListTransformation.cs b/OpenScraping/Transformations/TotalTextLengthAboveListTransformation.cs index f6baed8..29fe877 100644 --- a/OpenScraping/Transformations/TotalTextLengthAboveListTransformation.cs +++ b/OpenScraping/Transformations/TotalTextLengthAboveListTransformation.cs @@ -26,7 +26,7 @@ namespace OpenScraping.Transformations var grandParentNode = logicalParents[logicalParents.Count - 2]; HtmlAgilityPack.HtmlNode parentNode = grandParentNode; - if (settings != null && settings["_startingXPath"] != null && ((JValue)settings["_startingXPath"]).Type == JTokenType.String) + if (settings != null && settings.ContainsKey("_startingXPath") && ((JValue)settings["_startingXPath"]).Type == JTokenType.String) { var startingXPath = ((JValue)settings["_startingXPath"]).ToObject<string>();
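
Usage sketch for the transformations added in this change, mirroring the RegexTest and ParseDateTest methods and the TestData rule files above. The rule file name (example_rules.json) and the output field name (published) are illustrative placeholders, not part of the diff; the config keys, transformation names, and API calls are the ones exercised by the tests in this change.

// Illustrative rule file, e.g. TestData/example_rules.json (file name assumed):
// {
//   "published": {
//     "_xpath": "//p[@id='published-timestamp2']",
//     "_transformations": [
//       { "_type": "RegexTransformation", "_regex": "Published: (.*)" },
//       { "_type": "ParseDateTransformation", "_format": "MM-dd-yy" }
//     ]
//   }
// }
using System.IO;
using Newtonsoft.Json;
using OpenScraping;
using OpenScraping.Config;

static class ParseDateExample
{
    static void Main()
    {
        // Load the rules and run extraction against the sample article added in this change.
        var config = StructuredDataConfig.ParseJsonFile(Path.Combine("TestData", "example_rules.json"));
        var extractor = new StructuredDataExtractor(config);
        var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_date.html")));

        // The regex capture "12-30-11" is parsed with format "MM-dd-yy", so "published"
        // serializes as the DateTime 2011-12-30T00:00:00.
        System.Console.WriteLine(JsonConvert.SerializeObject(result, Formatting.Indented));
    }
}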