Add support for French decades with century (#3153)

* Draft commit

* Update regexes

* Fix DecadeRegex not defined before use for TypeScript

* Add testcase for "not able to recognize French in the 90s"

* Resolved review comments and added DateTimeModel specs

---------

Co-authored-by: Michael Wang (Centific Technologies Inc) <v-michwang@microsoft.com>
This commit is contained in:
Michael 2024-09-04 13:37:12 +08:00 коммит произвёл GitHub
Родитель 1b88159e35
Коммит cb8f16d8a1
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
6 изменённых файлов: 202 добавлений и 36 удалений

Просмотреть файл

@ -173,7 +173,9 @@ namespace Microsoft.Recognizers.Definitions.French
public static readonly string PeriodTimeOfDayWithDateRegex = $@"\b(({TimeOfDayRegex}))\b";
public const string LessThanRegex = @"^\b$";
public const string MoreThanRegex = @"^\b$";
public const string DurationUnitRegex = @"(?<unit>ann[eé]es?|ans?|mois|semaines?|jours?|heures?|hrs?|h|minutes?|mins?|secondes?|secs?|journ[eé]e)\b";
// Written-out French decade names, captured in the "decade" group.
// The plural "s" of "quatre-vingts" is matched OUTSIDE the group so that both spellings
// ("quatre-vingt" and "quatre-vingts") capture the text "quatre-vingt", which is the key
// present in WrittenDecades. Previously only "quatre-vingts" matched, and its captured text
// had no dictionary entry.
// NOTE(review): assumes the captured "decade" text is looked up verbatim in
// WrittenDecades / SpecialDecadeCases - confirm against the parser.
// Alternation order matters: longer names ("soixante-dix", "quatre-vingt-dix") are tried
// before their prefixes ("soixante", "quatre-vingt").
public const string DecadeRegex = @"(?:(?<decade>dix|vingt|trente|quarante|cinquante|soixante-dix|soixante|quatre-vingt-dix|deux\s+mille)|(?<decade>quatre-vingt)s?)";
public static readonly string DecadeWithCenturyInnerRegex = $@"(((?<century>\d|1\d|2\d)?((?<decade>\d0)\b)|(?<decade>\d0)(?=s))|(({CenturyRegex}(\s+)(et\s+)?)?{DecadeRegex})|({CenturyRegex}(\s+)(et\s+)?(?<decade>dix|centaines)))";
public static readonly string DurationUnitRegex = $@"(?<unit>\bann[eé]es?(?!\s+{DecadeWithCenturyInnerRegex})\b|ans?|mois|semaines?|jours?|heures?|hrs?|h|minutes?|mins?|secondes?|secs?|journ[eé]e)\b";
public const string SuffixAndRegex = @"(?<suffix>\s*(et)\s+(une?\s+)?(?<suffix_num>demi|quart))";
public const string PeriodicRegex = @"\b(?<periodic>quotidien(ne)?|journellement|mensuel(le)?|jours?|hebdomadaire|bihebdomadaire|annuel(lement)?)\b";
public static readonly string EachUnitRegex = $@"(?<each>(chaque|toutes les|tous les)(?<other>\s+autres)?\s*{DurationUnitRegex})";
@ -255,8 +257,7 @@ namespace Microsoft.Recognizers.Definitions.French
public const string NumberAsTimeRegex = @"^\b$";
public const string TimeBeforeAfterRegex = @"^\b$";
public const string DateNumberConnectorRegex = @"^\s*(?<connector>\s+[aà])\s*$";
public const string DecadeRegex = @"^\b$";
public const string DecadeWithCenturyRegex = @"^\b$";
public static readonly string DecadeWithCenturyRegex = $@"(les\s+)?(années)\s+{DecadeWithCenturyInnerRegex}";
public const string RelativeDecadeRegex = @"^\b$";
public static readonly string YearSuffix = $@"(,?(\s*à)?\s*({DateYearRegex}|{FullTextYearRegex}))";
public const string SuffixAfterRegex = @"^\b$";
@ -720,11 +721,19 @@ namespace Microsoft.Recognizers.Definitions.French
public const string NightRegex = @"\b(minuit|nuit)\b";
public static readonly Dictionary<string, int> WrittenDecades = new Dictionary<string, int>
{
{ @"", 0 }
{ @"dix", 10 },
{ @"vingt", 20 },
{ @"trente", 30 },
{ @"quarante", 40 },
{ @"cinquante", 50 },
{ @"soixante", 60 },
{ @"soixante-dix", 70 },
{ @"quatre-vingt", 80 },
{ @"quatre-vingt-dix", 90 }
};
public static readonly Dictionary<string, int> SpecialDecadeCases = new Dictionary<string, int>
{
{ @"", 0 }
{ @"deux mille", 2000 }
};
public const string DefaultLanguageFallback = @"DMY";
public static readonly string[] DurationDateRestrictions = { };

Просмотреть файл

@ -104,6 +104,7 @@ namespace Microsoft.Recognizers.Text.DateTime.French
SeasonMap = config.SeasonMap;
SpecialYearPrefixesMap = config.SpecialYearPrefixesMap;
WrittenDecades = config.WrittenDecades;
Numbers = config.Numbers;
SpecialDecadeCases = config.SpecialDecadeCases;
}

Просмотреть файл

@ -399,8 +399,14 @@ LessThanRegex: !simpleRegex
MoreThanRegex: !simpleRegex
# TODO: modify below regex according to the counterpart in English
def: ^\b$
DurationUnitRegex: !simpleRegex
def: (?<unit>ann[eé]es?|ans?|mois|semaines?|jours?|heures?|hrs?|h|minutes?|mins?|secondes?|secs?|journ[eé]e)\b
DecadeRegex: !simpleRegex
# Written-out French decade names, captured in the "decade" group.
# The plural "s" of "quatre-vingts" is matched outside the group so that both
# "quatre-vingt" and "quatre-vingts" capture the text "quatre-vingt", the key
# listed under WrittenDecades. Longer names are listed before their prefixes
# ("soixante-dix" before "soixante", "quatre-vingt-dix" before "quatre-vingt").
# NOTE(review): assumes the captured text is looked up verbatim in
# WrittenDecades / SpecialDecadeCases - confirm against the parser.
def: (?:(?<decade>dix|vingt|trente|quarante|cinquante|soixante-dix|soixante|quatre-vingt-dix|deux\s+mille)|(?<decade>quatre-vingt)s?)
DecadeWithCenturyInnerRegex: !nestedRegex
def: (((?<century>\d|1\d|2\d)?((?<decade>\d0)\b)|(?<decade>\d0)(?=s))|(({CenturyRegex}(\s+)(et\s+)?)?{DecadeRegex})|({CenturyRegex}(\s+)(et\s+)?(?<decade>dix|centaines)))
references: [ CenturyRegex, DecadeRegex ]
DurationUnitRegex: !nestedRegex
def: (?<unit>\bann[eé]es?(?!\s+{DecadeWithCenturyInnerRegex})\b|ans?|mois|semaines?|jours?|heures?|hrs?|h|minutes?|mins?|secondes?|secs?|journ[eé]e)\b
references: [ DecadeWithCenturyInnerRegex ]
SuffixAndRegex: !simpleRegex
def: (?<suffix>\s*(et)\s+(une?\s+)?(?<suffix_num>demi|quart))
PeriodicRegex: !simpleRegex
@ -592,12 +598,9 @@ TimeBeforeAfterRegex: !simpleRegex
def: ^\b$
DateNumberConnectorRegex: !simpleRegex
def: ^\s*(?<connector>\s+[aà])\s*$
DecadeRegex: !simpleRegex
# TODO: modify below regex according to the counterpart in English
def: ^\b$
DecadeWithCenturyRegex: !simpleRegex
# TODO: modify below regex according to the counterpart in English
def: ^\b$
DecadeWithCenturyRegex: !nestedRegex
def: (les\s+)?(années)\s+{DecadeWithCenturyInnerRegex}
references: [ DecadeWithCenturyInnerRegex ]
RelativeDecadeRegex: !simpleRegex
# TODO: modify below regex according to the counterpart in English
def: ^\b$
@ -1079,14 +1082,20 @@ NightRegex: !simpleRegex
def: \b(minuit|nuit)\b
WrittenDecades: !dictionary
types: [ string, int ]
# TODO: modify below dictionary according to the counterpart in English
entries:
'': 0
'dix': 10
'vingt': 20
'trente': 30
'quarante': 40
'cinquante': 50
'soixante': 60
'soixante-dix': 70
'quatre-vingt': 80
'quatre-vingt-dix': 90
SpecialDecadeCases: !dictionary
types: [ string, int ]
# TODO: modify below dictionary there're special cases for written decades
entries:
'': 0
'deux mille': 2000
DefaultLanguageFallback: DMY
DurationDateRestrictions: []
# Cases collected from mined data

Просмотреть файл

@ -4678,7 +4678,7 @@
},
{
"Input": "Dans les années 1970",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 1970",
@ -4690,7 +4690,7 @@
},
{
"Input": "Dans les années 2000, il est né.",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 2000",
@ -4726,7 +4726,7 @@
},
{
"Input": "Dans les années 70",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 70",
@ -4737,11 +4737,11 @@
]
},
{
"Input": "Dans les années 40",
"NotSupported": "dotnet, javascript, python, java",
"Input": "Dans les années 20",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 40",
"Text": "les années 20",
"Type": "daterange",
"Start": 5,
"Length": 13
@ -4750,7 +4750,7 @@
},
{
"Input": "Dans les années soixante-dix",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années soixante-dix",
@ -4762,7 +4762,7 @@
},
{
"Input": "Dans les années dix-neuf soixante-dix",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années dix-neuf soixante-dix",
@ -4772,6 +4772,30 @@
}
]
},
{
"Input": "Dans les années mille quatre cent vingt",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années mille quatre cent vingt",
"Type": "daterange",
"Start": 5,
"Length": 34
}
]
},
{
"Input": "Dans les années deux mille",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années deux mille",
"Type": "daterange",
"Start": 5,
"Length": 21
}
]
},
{
"Input": "Dans les deux mille dix",
"NotSupported": "dotnet, javascript, python, java",
@ -4786,7 +4810,7 @@
},
{
"Input": "Dans les années 2010",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 2010",
@ -4810,7 +4834,7 @@
},
{
"Input": "Dans les années 2000",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 2000",

Просмотреть файл

@ -4034,7 +4034,7 @@
},
{
"Input": "Dans les années 2000, il est né.",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 2000",
@ -4057,7 +4057,7 @@
},
{
"Input": "Dans les années 1970's",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 1970",
@ -4080,7 +4080,7 @@
},
{
"Input": "Dans les années 70s",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 70",
@ -4103,7 +4103,7 @@
},
{
"Input": "Dans les années 70's",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 70",
@ -4149,7 +4149,7 @@
},
{
"Input": "Dans les années 40",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 40",
@ -4172,7 +4172,7 @@
},
{
"Input": "Dans les années soixante-dix",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années soixante-dix",
@ -4193,9 +4193,78 @@
}
]
},
{
"Input": "Dans les années dix-neuf soixante-dix",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années dix-neuf soixante-dix",
"Type": "daterange",
"Value": {
"Timex": "(1970-01-01,1980-01-01,P10Y)",
"FutureResolution": {
"startDate": "1970-01-01",
"endDate": "1980-01-01"
},
"PastResolution": {
"startDate": "1970-01-01",
"endDate": "1980-01-01"
}
},
"Start": 5,
"Length": 32
}
]
},
{
"Input": "Dans les années mille quatre cent vingt",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années mille quatre cent vingt",
"Type": "daterange",
"Value": {
"Timex": "(1420-01-01,1430-01-01,P10Y)",
"FutureResolution": {
"startDate": "1420-01-01",
"endDate": "1430-01-01"
},
"PastResolution": {
"startDate": "1420-01-01",
"endDate": "1430-01-01"
}
},
"Start": 5,
"Length": 34
}
]
},
{
"Input": "Dans les années deux mille",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années deux mille",
"Type": "daterange",
"Value": {
"Timex": "(2000-01-01,2010-01-01,P10Y)",
"FutureResolution": {
"startDate": "2000-01-01",
"endDate": "2010-01-01"
},
"PastResolution": {
"startDate": "2000-01-01",
"endDate": "2010-01-01"
}
},
"Start": 5,
"Length": 21
}
]
},
{
"Input": "Dans les années 1970",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 1970",
@ -4241,7 +4310,7 @@
},
{
"Input": "Dans les années 2010",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 2010",
@ -4287,7 +4356,7 @@
},
{
"Input": "Dans les années 2000",
"NotSupported": "dotnet, javascript, python, java",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 2000",

Просмотреть файл

@ -3756,6 +3756,60 @@
}
]
},
{
"Input": "Qui était notre président dans les années 1990 ?",
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années 1990",
"Start": 31,
"End": 45,
"TypeName": "datetimeV2.daterange",
"Resolution": {
"values": [
{
"timex": "(1990-01-01,2000-01-01,P10Y)",
"type": "daterange",
"start": "1990-01-01",
"end": "2000-01-01"
}
]
}
}
]
},
{
"Input": "Qui étaient les présidents des États-Unis dans les années quatre-vingt-dix ?",
"Debug": true,
"Context": {
"ReferenceDateTime": "2018-05-29T00:00:00"
},
"NotSupported": "javascript, python, java",
"Results": [
{
"Text": "les années quatre-vingt-dix",
"Start": 47,
"End": 73,
"TypeName": "datetimeV2.daterange",
"Resolution": {
"values": [
{
"timex": "(XX90-01-01,XX00-01-01,P10Y)",
"type": "daterange",
"start": "1990-01-01",
"end": "2000-01-01"
},
{
"timex": "(XX90-01-01,XX00-01-01,P10Y)",
"type": "daterange",
"start": "2090-01-01",
"end": "2100-01-01"
}
]
}
}
]
},
{
"Input": "Je vais rentrer le 02/oct.",
"NotSupported": "dotnet, javascript, python, java",