Bug 1782579 - Pass languages to the text recognition api; r=nordzilla,emilio

Differential Revision: https://phabricator.services.mozilla.com/D153668
This commit is contained in:
Greg Tatum 2022-08-10 16:08:46 +00:00
Родитель 57b038eab3
Коммит 980c8191a3
12 изменённых файлов: 281 добавлений и 126 удалений

Просмотреть файл

@ -52,6 +52,8 @@
#include "mozilla/dom/ImageTextBinding.h"
#include "mozilla/dom/ImageTracker.h"
#include "mozilla/dom/ScriptSettings.h"
#include "mozilla/intl/LocaleService.h"
#include "mozilla/intl/Locale.h"
#include "mozilla/net/UrlClassifierFeatureFactory.h"
#include "mozilla/widget/TextRecognition.h"
@ -1232,60 +1234,98 @@ already_AddRefed<Promise> nsImageLoadingContent::RecognizeCurrentImageText(
return nullptr;
}
TextRecognition::FindText(*image)->Then(
GetCurrentSerialEventTarget(), __func__,
[weak = RefPtr{do_GetWeakReference(this)},
request = RefPtr{mCurrentRequest}, domPromise](
TextRecognition::NativePromise::ResolveOrRejectValue&& aValue) {
if (aValue.IsReject()) {
domPromise->MaybeRejectWithNotSupportedError(aValue.RejectValue());
return;
}
RefPtr<nsIImageLoadingContent> iilc = do_QueryReferent(weak.get());
if (!iilc) {
domPromise->MaybeRejectWithInvalidStateError(
"Element was dead when we got the results");
return;
}
auto* ilc = static_cast<nsImageLoadingContent*>(iilc.get());
if (ilc->mCurrentRequest != request) {
domPromise->MaybeRejectWithInvalidStateError("Request not current");
return;
}
auto& textRecognitionResult = aValue.ResolveValue();
Element* el = ilc->AsContent()->AsElement();
// The list of ISO 639-1 language tags to pass to the text recognition API.
AutoTArray<nsCString, 4> languages;
{
// The document's locale should be the top language to use. Parse the BCP 47
// locale and extract the ISO 639-1 language tag. e.g. "en-US" -> "en".
nsAutoCString elementLanguage;
nsAtom* imgLanguage = AsContent()->GetLang();
intl::Locale locale;
if (imgLanguage) {
imgLanguage->ToUTF8String(elementLanguage);
auto result = intl::LocaleParser::TryParse(elementLanguage, locale);
if (result.isOk()) {
languages.AppendElement(locale.Language().Span());
}
}
}
// When enabled, this feature will place the recognized text as spans
// inside of the shadow dom of the img element. These are then
// positioned so that the user can select the text.
if (Preferences::GetBool("dom.text-recognition.shadow-dom-enabled",
false)) {
el->AttachAndSetUAShadowRoot(Element::NotifyUAWidgetSetup::Yes);
TextRecognition::FillShadow(*el->GetShadowRoot(),
textRecognitionResult);
el->NotifyUAWidgetSetupOrChange();
}
{
// The app locales should also be included after the document's locales.
// Extract the language tag like above.
nsTArray<nsCString> appLocales;
intl::LocaleService::GetInstance()->GetAppLocalesAsBCP47(appLocales);
nsTArray<ImageText> imageTexts(textRecognitionResult.quads().Length());
nsIGlobalObject* global = el->OwnerDoc()->GetOwnerGlobal();
for (const auto& localeString : appLocales) {
intl::Locale locale;
auto result = intl::LocaleParser::TryParse(localeString, locale);
if (result.isErr()) {
NS_WARNING("Could not parse an app locale string, ignoring it.");
continue;
}
languages.AppendElement(locale.Language().Span());
}
}
for (const auto& quad : textRecognitionResult.quads()) {
NotNull<ImageText*> imageText = imageTexts.AppendElement();
TextRecognition::FindText(*image, languages)
->Then(
GetCurrentSerialEventTarget(), __func__,
[weak = RefPtr{do_GetWeakReference(this)},
request = RefPtr{mCurrentRequest}, domPromise](
TextRecognition::NativePromise::ResolveOrRejectValue&& aValue) {
if (aValue.IsReject()) {
domPromise->MaybeRejectWithNotSupportedError(
aValue.RejectValue());
return;
}
RefPtr<nsIImageLoadingContent> iilc = do_QueryReferent(weak.get());
if (!iilc) {
domPromise->MaybeRejectWithInvalidStateError(
"Element was dead when we got the results");
return;
}
auto* ilc = static_cast<nsImageLoadingContent*>(iilc.get());
if (ilc->mCurrentRequest != request) {
domPromise->MaybeRejectWithInvalidStateError(
"Request not current");
return;
}
auto& textRecognitionResult = aValue.ResolveValue();
Element* el = ilc->AsContent()->AsElement();
// Note: These points are not actually CSSPixels, but a DOMQuad is
// a conveniently similar structure that can store these values.
CSSPoint points[4];
points[0] = CSSPoint(quad.points()[0].x, quad.points()[0].y);
points[1] = CSSPoint(quad.points()[1].x, quad.points()[1].y);
points[2] = CSSPoint(quad.points()[2].x, quad.points()[2].y);
points[3] = CSSPoint(quad.points()[3].x, quad.points()[3].y);
// When enabled, this feature will place the recognized text as
// spans inside of the shadow dom of the img element. These are then
// positioned so that the user can select the text.
if (Preferences::GetBool("dom.text-recognition.shadow-dom-enabled",
false)) {
el->AttachAndSetUAShadowRoot(Element::NotifyUAWidgetSetup::Yes);
TextRecognition::FillShadow(*el->GetShadowRoot(),
textRecognitionResult);
el->NotifyUAWidgetSetupOrChange();
}
imageText->mQuad = new DOMQuad(global, points);
imageText->mConfidence = quad.confidence();
imageText->mString = quad.string();
}
domPromise->MaybeResolve(std::move(imageTexts));
});
nsTArray<ImageText> imageTexts(
textRecognitionResult.quads().Length());
nsIGlobalObject* global = el->OwnerDoc()->GetOwnerGlobal();
for (const auto& quad : textRecognitionResult.quads()) {
NotNull<ImageText*> imageText = imageTexts.AppendElement();
// Note: These points are not actually CSSPixels, but a DOMQuad is
// a conveniently similar structure that can store these values.
CSSPoint points[4];
points[0] = CSSPoint(quad.points()[0].x, quad.points()[0].y);
points[1] = CSSPoint(quad.points()[1].x, quad.points()[1].y);
points[2] = CSSPoint(quad.points()[2].x, quad.points()[2].y);
points[3] = CSSPoint(quad.points()[3].x, quad.points()[3].y);
imageText->mQuad = new DOMQuad(global, points);
imageText->mConfidence = quad.confidence();
imageText->mString = quad.string();
}
domPromise->MaybeResolve(std::move(imageTexts));
});
return domPromise.forget();
}

Просмотреть файл

@ -4965,23 +4965,25 @@ mozilla::ipc::IPCResult ContentParent::RecvCopyFavicon(
}
mozilla::ipc::IPCResult ContentParent::RecvFindImageText(
ShmemImage&& aImage, FindImageTextResolver&& aResolver) {
ShmemImage&& aImage, nsTArray<nsCString>&& aLanguages,
FindImageTextResolver&& aResolver) {
RefPtr<DataSourceSurface> surf =
nsContentUtils::IPCImageToSurface(std::move(aImage), this);
if (!surf) {
aResolver(TextRecognitionResultOrError("Failed to read image"_ns));
return IPC_OK();
}
TextRecognition::FindText(*surf)->Then(
GetCurrentSerialEventTarget(), __func__,
[resolver = std::move(aResolver)](
TextRecognition::NativePromise::ResolveOrRejectValue&& aValue) {
if (aValue.IsResolve()) {
resolver(TextRecognitionResultOrError(aValue.ResolveValue()));
} else {
resolver(TextRecognitionResultOrError(aValue.RejectValue()));
}
});
TextRecognition::FindText(*surf, aLanguages)
->Then(
GetCurrentSerialEventTarget(), __func__,
[resolver = std::move(aResolver)](
TextRecognition::NativePromise::ResolveOrRejectValue&& aValue) {
if (aValue.IsResolve()) {
resolver(TextRecognitionResultOrError(aValue.ResolveValue()));
} else {
resolver(TextRecognitionResultOrError(aValue.RejectValue()));
}
});
return IPC_OK();
}

Просмотреть файл

@ -1110,7 +1110,7 @@ class ContentParent final : public PContentParent,
mozilla::ipc::IPCResult RecvCopyFavicon(nsIURI* aOldURI, nsIURI* aNewURI,
const bool& aInPrivateBrowsing);
mozilla::ipc::IPCResult RecvFindImageText(ShmemImage&&,
mozilla::ipc::IPCResult RecvFindImageText(ShmemImage&&, nsTArray<nsCString>&&,
FindImageTextResolver&&);
virtual void ProcessingError(Result aCode, const char* aMsgName) override;

Просмотреть файл

@ -1237,7 +1237,8 @@ parent:
async CopyFavicon(nsIURI oldURI, nsIURI newURI, bool isPrivate);
async FindImageText(ShmemImage image) returns (TextRecognitionResultOrError result);
async FindImageText(ShmemImage image, nsCString[] languages)
returns (TextRecognitionResultOrError result);
/**
* Notifies the parent about a recording device is starting or shutdown.

Просмотреть файл

@ -2,6 +2,7 @@
skip-if = os == 'android'
support-files =
image.png
image-zh.png
tree_shared.js
popup_shared.js
window_label_checkbox.xhtml
@ -15,6 +16,10 @@ skip-if = os == 'linux' # Bug 1115088
[test_editor_currentURI.xhtml]
[test_image_recognition.html]
run-if = os == "mac" # Mac only feature.
[test_image_recognition_unsupported.html]
skip-if = os == 'mac'
[test_image_recognition_zh.html]
run-if = os == "mac" && os_version != "10.15" # Mac only feature, requires > 10.15 to support multilingual results.
[test_label_checkbox.xhtml]
[test_menubar.xhtml]
skip-if = os == 'mac'

Двоичные данные
toolkit/content/tests/widgets/image-zh.png Normal file

Двоичный файл не отображается.

После

Ширина:  |  Высота:  |  Размер: 5.3 KiB

Просмотреть файл

@ -4,8 +4,8 @@
<title>Image recognition test</title>
<script src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js"></script>
<script src="chrome://mochikit/content/tests/SimpleTest/EventUtils.js"></script>
<script type="text/javascript" src="head.js"></script>
<link rel="stylesheet" type="text/css" href="chrome://mochikit/content/tests/SimpleTest/test.css" />
<script src="head.js"></script>
<link rel="stylesheet" href="chrome://mochikit/content/tests/SimpleTest/test.css" />
</head>
<body>
<p id="display"></p>
@ -15,11 +15,10 @@
</div>
<pre id="test">
<script class="testbody" type="text/javascript">
<script class="testbody">
const { TestUtils } = ChromeUtils.import(
"resource://testing-common/TestUtils.jsm"
);
SimpleTest.waitForExplicitFinish();
function pushPref(preferenceName, value) {
return new Promise(resolve => {
@ -28,41 +27,39 @@
});
}
window.addEventListener("load", async () => {
try {
await pushPref("dom.text-recognition.shadow-dom-enabled", true);
const img = document.querySelector("#content img");
add_task(async () => {
// Performing text recognition in CI can take some time, and test verify runs have
// timed out.
SimpleTest.requestLongerTimeout(2);
info("Recognizing the image text");
const result = await img.recognizeCurrentImageText();
is(result.length, 2, "Two words were found.");
const mozilla = result.find(r => r.string === "Mozilla");
const firefox = result.find(r => r.string === "Firefox");
await pushPref("dom.text-recognition.shadow-dom-enabled", true);
const img = document.querySelector("#content img");
ok(mozilla, "The word Mozilla was found.");
ok(firefox, "The word Firefox was found.");
info("Recognizing the image text");
const result = await img.recognizeCurrentImageText();
is(result.length, 2, "Two words were found.");
const mozilla = result.find(r => r.string === "Mozilla");
const firefox = result.find(r => r.string === "Firefox");
ok(mozilla.quad.p1.x < firefox.quad.p2.x, "The Mozilla text is left of Firefox");
ok(mozilla.quad.p1.y > firefox.quad.p2.y, "The Mozilla text is above Firefox");
ok(mozilla, "The word Mozilla was found.");
ok(firefox, "The word Firefox was found.");
const spans = await TestUtils.waitForCondition(
() => shadowRootQuerySelectorAll(img, "span"),
"Attempting to get image recognition spans."
);
ok(mozilla.quad.p1.x < firefox.quad.p2.x, "The Mozilla text is left of Firefox");
ok(mozilla.quad.p1.y > firefox.quad.p2.y, "The Mozilla text is above Firefox");
const mozillaSpan = [...spans].find(s => s.innerText === "Mozilla");
const firefoxSpan = [...spans].find(s => s.innerText === "Firefox");
const spans = await TestUtils.waitForCondition(
() => shadowRootQuerySelectorAll(img, "span"),
"Attempting to get image recognition spans."
);
ok(mozillaSpan, "The word Mozilla span was found.");
ok(firefoxSpan, "The word Firefox span was found.");
const mozillaSpan = [...spans].find(s => s.innerText === "Mozilla");
const firefoxSpan = [...spans].find(s => s.innerText === "Firefox");
ok(mozillaSpan.style.transform.startsWith("matrix3d("), "A matrix transform was applied");
ok(firefoxSpan.style.transform.startsWith("matrix3d("), "A matrix transform was applied");
} catch (error) {
ok(false, error.message + '\n' + error.stack);
}
ok(mozillaSpan, "The word Mozilla span was found.");
ok(firefoxSpan, "The word Firefox span was found.");
SimpleTest.finish()
ok(mozillaSpan.style.transform.startsWith("matrix3d("), "A matrix transform was applied");
ok(firefoxSpan.style.transform.startsWith("matrix3d("), "A matrix transform was applied");
});
</script>
</pre>

Просмотреть файл

@ -0,0 +1,36 @@
<!DOCTYPE HTML>
<html>
<head>
<title>Image recognition unsupported</title>
<script src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js"></script>
<script src="chrome://mochikit/content/tests/SimpleTest/EventUtils.js"></script>
<script src="head.js"></script>
<link rel="stylesheet" href="chrome://mochikit/content/tests/SimpleTest/test.css" />
</head>
<body>
<p id="display"></p>
<div id="content">
<img src="image.png" />
</div>
<pre id="test">
<script class="testbody">
/**
 * This test is for platforms that do not support text recognition.
 * It verifies that recognizeCurrentImageText() rejects rather than
 * resolving; the mochitest.ini skip-if/run-if annotations ensure it
 * only runs where the feature is unavailable (skipped on macOS).
 */
add_task(async () => {
const img = document.querySelector("#content img");
info("Recognizing the current image text is not supported on this platform.");
try {
// The promise is expected to reject (NotSupportedError from the
// text recognition backend), so reaching the next line is a failure.
await img.recognizeCurrentImageText();
ok(false, "Recognizing the text should not be supported.");
} catch (error) {
// Any rejection value counts as success here; log its message for
// debugging. NOTE(review): this does not pin the exact error type —
// presumably NotSupportedError; confirm if stricter checking is wanted.
ok(error, "Expected unsupported message: " + error.message);
}
});
</script>
</pre>
</body>
</html>

Просмотреть файл

@ -0,0 +1,53 @@
<!DOCTYPE HTML>
<html>
<head>
<title>Image recognition test for Chinese</title>
<script src="chrome://mochikit/content/tests/SimpleTest/SimpleTest.js"></script>
<script src="chrome://mochikit/content/tests/SimpleTest/EventUtils.js"></script>
<script src="head.js"></script>
<link rel="stylesheet" href="chrome://mochikit/content/tests/SimpleTest/test.css" />
</head>
<body>
<p id="display"></p>
<div id="content">
<img src="image-zh.png" />
</div>
<pre id="test">
<script class="testbody">
// Performing text recognition in CI can take some time, and test verify runs have
// timed out.
SimpleTest.requestLongerTimeout(2);
/**
 * This test exercises the code path where the image recognition APIs detect the
 * document language and use it to choose the language.
 *
 * It runs recognition twice on the same image: first with the default
 * (non-Chinese) document language, where the Chinese glyphs should NOT be
 * read back correctly, then after setting lang="zh-Hans-CN" on the root
 * element, where recognition is expected to return the Chinese text.
 */
add_task(async () => {
const img = document.querySelector("#content img");
info("Recognizing the image text, but not as Chinese");
{
// Baseline pass: no zh language hint, so none of the recognized strings
// should equal the expected Chinese word ("火狐" = Firefox).
const result = await img.recognizeCurrentImageText();
for (const { string } of result) {
isnot(string, "火狐", 'The results are (as expected) incorrect, as Chinese was not set as the language.');
}
}
info("Setting the document to Chinese.");
// The element language is propagated into the languages list passed to the
// platform text recognition API, so changing the root lang changes results.
document.documentElement.setAttribute("lang", "zh-Hans-CN");
info("Recognizing the image text");
{
// Second pass: with the zh hint the recognizer should find exactly the
// one Chinese word in the test image.
const result = await img.recognizeCurrentImageText();
is(result.length, 1, "One word was found.");
is(result[0].string, "火狐", "The Chinese characters for Firefox are found.");
}
// Restore the document language so later tests are unaffected.
// NOTE(review): this line is skipped if an assertion above throws —
// consider a registerCleanupFunction if test isolation matters here.
document.documentElement.setAttribute("lang", "en-US");
});
</script>
</pre>
</body>
</html>

Просмотреть файл

@ -14,7 +14,9 @@ using namespace mozilla::dom;
namespace mozilla::widget {
auto TextRecognition::FindText(imgIContainer& aImage) -> RefPtr<NativePromise> {
auto TextRecognition::FindText(imgIContainer& aImage,
const nsTArray<nsCString>& aLanguages)
-> RefPtr<NativePromise> {
// TODO: Maybe decode async.
RefPtr<gfx::SourceSurface> surface = aImage.GetFrame(
imgIContainer::FRAME_CURRENT,
@ -27,10 +29,11 @@ auto TextRecognition::FindText(imgIContainer& aImage) -> RefPtr<NativePromise> {
return NativePromise::CreateAndReject("Failed to get data surface"_ns,
__func__);
}
return FindText(*dataSurface);
return FindText(*dataSurface, aLanguages);
}
auto TextRecognition::FindText(gfx::DataSourceSurface& aSurface)
auto TextRecognition::FindText(gfx::DataSourceSurface& aSurface,
const nsTArray<nsCString>& aLanguages)
-> RefPtr<NativePromise> {
if (XRE_IsContentProcess()) {
auto* contentChild = ContentChild::GetSingleton();
@ -40,31 +43,32 @@ auto TextRecognition::FindText(gfx::DataSourceSurface& aSurface)
__func__);
}
auto promise = MakeRefPtr<NativePromise::Private>(__func__);
contentChild->SendFindImageText(*image)->Then(
GetCurrentSerialEventTarget(), __func__,
[promise](TextRecognitionResultOrError&& aResultOrError) {
switch (aResultOrError.type()) {
case TextRecognitionResultOrError::Type::TTextRecognitionResult:
promise->Resolve(
std::move(aResultOrError.get_TextRecognitionResult()),
__func__);
break;
case TextRecognitionResultOrError::Type::TnsCString:
promise->Reject(std::move(aResultOrError.get_nsCString()),
__func__);
break;
default:
MOZ_ASSERT_UNREACHABLE("Unknown result?");
promise->Reject("Unknown error"_ns, __func__);
break;
}
},
[promise](mozilla::ipc::ResponseRejectReason) {
promise->Reject("IPC rejection"_ns, __func__);
});
contentChild->SendFindImageText(*image, aLanguages)
->Then(
GetCurrentSerialEventTarget(), __func__,
[promise](TextRecognitionResultOrError&& aResultOrError) {
switch (aResultOrError.type()) {
case TextRecognitionResultOrError::Type::TTextRecognitionResult:
promise->Resolve(
std::move(aResultOrError.get_TextRecognitionResult()),
__func__);
break;
case TextRecognitionResultOrError::Type::TnsCString:
promise->Reject(std::move(aResultOrError.get_nsCString()),
__func__);
break;
default:
MOZ_ASSERT_UNREACHABLE("Unknown result?");
promise->Reject("Unknown error"_ns, __func__);
break;
}
},
[promise](mozilla::ipc::ResponseRejectReason) {
promise->Reject("IPC rejection"_ns, __func__);
});
return promise;
}
return DoFindText(aSurface);
return DoFindText(aSurface, aLanguages);
}
void TextRecognition::FillShadow(ShadowRoot& aShadow,
@ -101,7 +105,8 @@ void TextRecognition::FillShadow(ShadowRoot& aShadow,
}
#ifndef XP_MACOSX
auto TextRecognition::DoFindText(gfx::DataSourceSurface&)
auto TextRecognition::DoFindText(gfx::DataSourceSurface&,
const nsTArray<nsCString>&)
-> RefPtr<NativePromise> {
MOZ_RELEASE_ASSERT(XRE_IsParentProcess(),
"This should only run in the parent process");

Просмотреть файл

@ -34,12 +34,15 @@ class TextRecognition final {
static void FillShadow(dom::ShadowRoot&, const dom::TextRecognitionResult&);
static RefPtr<NativePromise> FindText(imgIContainer&);
static RefPtr<NativePromise> FindText(gfx::DataSourceSurface&);
static RefPtr<NativePromise> FindText(imgIContainer&,
const nsTArray<nsCString>&);
static RefPtr<NativePromise> FindText(gfx::DataSourceSurface&,
const nsTArray<nsCString>&);
protected:
// This should be implemented in the OS specific file.
static RefPtr<NativePromise> DoFindText(gfx::DataSourceSurface&);
static RefPtr<NativePromise> DoFindText(gfx::DataSourceSurface&,
const nsTArray<nsCString>&);
~TextRecognition() = default;
};

Просмотреть файл

@ -17,7 +17,8 @@
namespace mozilla::widget {
auto TextRecognition::DoFindText(gfx::DataSourceSurface& aSurface) -> RefPtr<NativePromise> {
auto TextRecognition::DoFindText(gfx::DataSourceSurface& aSurface,
const nsTArray<nsCString>& aLanguages) -> RefPtr<NativePromise> {
NS_OBJC_BEGIN_TRY_IGNORE_BLOCK
if (@available(macOS 10.15, *)) {
// TODO - Is this the most efficient path? Maybe we can write a new
@ -30,11 +31,19 @@ auto TextRecognition::DoFindText(gfx::DataSourceSurface& aSurface) -> RefPtr<Nat
auto promise = MakeRefPtr<NativePromise::Private>(__func__);
NSMutableArray* recognitionLanguages = [[NSMutableArray alloc] init];
for (const auto& locale : aLanguages) {
[recognitionLanguages addObject:nsCocoaUtils::ToNSString(locale)];
}
NS_DispatchBackgroundTask(
NS_NewRunnableFunction(
__func__,
[promise, imageRef] {
auto unrefImage = MakeScopeExit([&] { ::CGImageRelease(imageRef); });
[promise, imageRef, recognitionLanguages] {
auto unrefImage = MakeScopeExit([&] {
::CGImageRelease(imageRef);
[recognitionLanguages release];
});
dom::TextRecognitionResult result;
dom::TextRecognitionResult* pResult = &result;
@ -67,6 +76,10 @@ auto TextRecognition::DoFindText(gfx::DataSourceSurface& aSurface) -> RefPtr<Nat
}];
}];
textRecognitionRequest.recognitionLevel = VNRequestTextRecognitionLevelAccurate;
textRecognitionRequest.recognitionLanguages = recognitionLanguages;
textRecognitionRequest.usesLanguageCorrection = true;
// Send out the request. This blocks execution of this thread with an expensive
// CPU call.
NSError* error = nil;