Fix PlainText's handling of open tags (#2691)

Fixes #2679. Currently the PlainText function doesn't handle some open tags correctly: it is supposed to convert tags like <br/> to a new line and <p> to two new lines, but it also matches tags such as <bra> and <para>, which should not be matched.
This commit is contained in:
Carlos Figueira 2024-10-10 10:09:19 -07:00 коммит произвёл GitHub
Родитель d53c8425c1
Коммит ac3d51d049
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
2 изменённых файлов: 13 добавлений и 4 удалений

Просмотреть файл

@ -47,8 +47,8 @@ namespace Microsoft.PowerFx.Functions
private static readonly Regex _secondsDetokenizeRegex = new Regex("[\u0008][\u0008]+", RegExFlags); private static readonly Regex _secondsDetokenizeRegex = new Regex("[\u0008][\u0008]+", RegExFlags);
private static readonly Regex _milisecondsDetokenizeRegex = new Regex("[\u000e]+", RegExFlags); private static readonly Regex _milisecondsDetokenizeRegex = new Regex("[\u000e]+", RegExFlags);
private static readonly Regex _tdTagRegex = new Regex("<\\s*(td)[\\s\\S]*?\\/{0,1}>", RegExFlags_IgnoreCase); private static readonly Regex _tdTagRegex = new Regex("<\\s*(td)[\\s\\S]*?\\/{0,1}>", RegExFlags_IgnoreCase);
private static readonly Regex _lineBreakTagRegex = new Regex("<\\s*(br|li)[\\s\\S]*?\\/{0,1}>", RegExFlags_IgnoreCase); private static readonly Regex _lineBreakTagRegex = new Regex("<\\s*(br|li)((\\s+[\\s\\S]*?)|(\\s*\\/\\s*))?>", RegExFlags_IgnoreCase);
private static readonly Regex _doubleLineBreakTagRegex = new Regex("<\\s*(div|p|tr)[\\s\\S]*?\\/{0,1}>", RegExFlags_IgnoreCase); private static readonly Regex _doubleLineBreakTagRegex = new Regex("<\\s*(div|p|tr)((\\s+[\\s\\S]*?)|(\\s*\\/\\s*))?>", RegExFlags_IgnoreCase);
private static readonly Regex _commentTagRegex = new Regex("<!--[\\s\\S]*?--\\s*>", RegExFlags_IgnoreCase); private static readonly Regex _commentTagRegex = new Regex("<!--[\\s\\S]*?--\\s*>", RegExFlags_IgnoreCase);
private static readonly Regex _headerTagRegex = new Regex("<\\s*(header)[\\s\\S]*?>[\\s\\S]*?<\\s*\\/\\s*(header)\\s*>", RegExFlags_IgnoreCase); private static readonly Regex _headerTagRegex = new Regex("<\\s*(header)[\\s\\S]*?>[\\s\\S]*?<\\s*\\/\\s*(header)\\s*>", RegExFlags_IgnoreCase);
private static readonly Regex _scriptTagRegex = new Regex("<\\s*(script)[\\s\\S]*?>[\\s\\S]*?<\\s*\\/\\s*(script)\\s*>", RegExFlags_IgnoreCase); private static readonly Regex _scriptTagRegex = new Regex("<\\s*(script)[\\s\\S]*?>[\\s\\S]*?<\\s*\\/\\s*(script)\\s*>", RegExFlags_IgnoreCase);

Просмотреть файл

@ -1,4 +1,4 @@
>> PlainText("") >> PlainText("")
"" ""
>> PlainText("<>") >> PlainText("<>")
@ -97,4 +97,13 @@ Error({Kind:ErrorKind.Div0})
"1\r\n\r\n2\r\n\r\n3\r\n\r\n4" "1\r\n\r\n2\r\n\r\n3\r\n\r\n4"
>> PlainText("Hello&lt;br/&gt;world") >> PlainText("Hello&lt;br/&gt;world")
"Hello<br/>world" "Hello<br/>world"
>> PlainText("Many character entities: &lt;&gt;&amp;&quot;&apos;&cent;&pound;&yen;&euro;&copy;&reg;&aacute;&egrave;&otilde;&ccedil;&ucirc;&alpha;&beta;&gamma;&delta;&Delta;")
"Many character entities: <>&""'¢£¥€©®áèõçûαβγδΔ"
>> PlainText("More character references: &#X1f970; - &#x1F948;")
"More character references: 🥰 - 🥈"
>> PlainText("<para>Not a <break>line break.</para><para>Also not a line break.</para>")
"Not a line break.Also not a line break."