[TextExtractor] Add space between CJK words and non-CJK (#20926)
* [TextExtractor] add space between CJK words and non-CJK
This commit is contained in:
Родитель
d4083abee2
Коммит
d17ac2bf79
|
@ -10,6 +10,7 @@ using System.Globalization;
|
|||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Threading.Tasks;
|
||||
using System.Windows;
|
||||
using System.Windows.Input;
|
||||
|
@ -146,11 +147,25 @@ internal class ImageMethods
|
|||
}
|
||||
else
|
||||
{
|
||||
var cjkRegex = new Regex(@"\p{IsCJKUnifiedIdeographs}");
|
||||
|
||||
foreach (OcrLine ocrLine in ocrResult.Lines)
|
||||
{
|
||||
bool isBeginning = true;
|
||||
bool isCJKPrev = false;
|
||||
foreach (OcrWord ocrWord in ocrLine.Words)
|
||||
{
|
||||
bool isCJK = cjkRegex.IsMatch(ocrWord.Text);
|
||||
|
||||
// Separate words with a space, except between two adjacent words that both contain CJK ideographs.
|
||||
if (!isBeginning && (!isCJK || !isCJKPrev))
|
||||
{
|
||||
_ = text.Append(' ');
|
||||
}
|
||||
|
||||
_ = text.Append(ocrWord.Text);
|
||||
isCJKPrev = isCJK;
|
||||
isBeginning = false;
|
||||
}
|
||||
|
||||
text.Append(Environment.NewLine);
|
||||
|
|
Загрузка…
Ссылка в новой задаче