From 2118316ba43963ddb478dfebaf7bf4fed6dc1e01 Mon Sep 17 00:00:00 2001 From: Dorel Luca Date: Mon, 21 Jun 2021 12:10:54 +0300 Subject: [PATCH] Backed out changeset 4891a17c55e2 (bug 1713627) for Browser-chrome failures in docshell/test/browser/browser_bug673087-1.js. CLOSED TREE --- Cargo.lock | 21 ++ browser/actors/BrowserTabChild.jsm | 5 +- .../base/content/appmenu-viewcache.inc.xhtml | 15 + browser/base/content/browser-doctype.inc | 2 + browser/base/content/browser-menubar.inc | 5 +- browser/base/content/browser.js | 49 ++- .../customizableui/CustomizableWidgets.jsm | 2 +- docshell/base/nsDocShell.cpp | 88 +++++- docshell/base/nsDocShell.h | 5 +- docshell/base/nsIDocShell.idl | 16 +- docshell/test/browser/browser.ini | 13 + docshell/test/browser/browser_bug134911.js | 2 +- docshell/test/browser/browser_bug1543077-1.js | 47 +++ docshell/test/browser/browser_bug1543077-2.js | 47 +++ docshell/test/browser/browser_bug1543077-3.js | 1 + docshell/test/browser/browser_bug1543077-4.js | 47 +++ docshell/test/browser/browser_bug1648464-1.js | 1 + docshell/test/browser/browser_bug1688368-1.js | 1 + docshell/test/browser/browser_bug234628-1.js | 18 +- docshell/test/browser/browser_bug234628-10.js | 9 +- docshell/test/browser/browser_bug234628-11.js | 9 +- docshell/test/browser/browser_bug234628-2.js | 20 +- docshell/test/browser/browser_bug234628-3.js | 20 +- docshell/test/browser/browser_bug234628-4.js | 9 +- docshell/test/browser/browser_bug234628-5.js | 9 +- docshell/test/browser/browser_bug234628-6.js | 12 +- docshell/test/browser/browser_bug234628-7.js | 49 +++ docshell/test/browser/browser_bug234628-8.js | 2 +- docshell/test/browser/browser_bug234628-9.js | 2 +- docshell/test/browser/browser_bug673087-2.js | 1 + docshell/test/browser/browser_bug92473.js | 2 +- .../test/browser/file_bug1543077-1-child.html | 11 + docshell/test/browser/file_bug1543077-1.html | 16 + .../test/browser/file_bug1543077-2-child.html | 11 + docshell/test/browser/file_bug1543077-2.html | 16 + 
.../test/browser/file_bug1543077-4-child.html | 11 + docshell/test/browser/file_bug1543077-4.html | 16 + .../test/browser/file_bug234628-7-child.html | 12 + .../file_bug234628-7-child.html^headers^ | 1 + docshell/test/browser/file_bug234628-7.html | 18 ++ docshell/test/browser/head.js | 47 ++- dom/base/nsDOMWindowUtils.cpp | 7 +- dom/html/nsHTMLDocument.cpp | 35 ++- dom/html/nsHTMLDocument.h | 3 + dom/interfaces/base/nsIBrowser.idl | 9 + dom/ipc/BrowserChild.cpp | 3 + dom/ipc/BrowserParent.cpp | 3 + dom/ipc/PBrowser.ipdl | 2 + intl/JapaneseDetector.h | 124 ++++++++ intl/moz.build | 2 + mobile/android/locales/l10n.toml | 4 + modules/libpref/init/StaticPrefList.yaml | 18 ++ parser/html/nsHtml5StreamParser.cpp | 150 ++++++++-- parser/html/nsHtml5StreamParser.h | 11 + parser/nsCharsetSource.h | 4 + .../rust/shift_or_euc/.cargo-checksum.json | 1 + third_party/rust/shift_or_euc/CONTRIBUTING.md | 38 +++ third_party/rust/shift_or_euc/COPYRIGHT | 9 + third_party/rust/shift_or_euc/Cargo.toml | 30 ++ third_party/rust/shift_or_euc/LICENSE-APACHE | 202 +++++++++++++ third_party/rust/shift_or_euc/LICENSE-MIT | 25 ++ third_party/rust/shift_or_euc/README.md | 73 +++++ .../rust/shift_or_euc/examples/detect.rs | 56 ++++ third_party/rust/shift_or_euc/src/lib.rs | 278 ++++++++++++++++++ .../rust/shift_or_euc_c/.cargo-checksum.json | 1 + .../rust/shift_or_euc_c/CONTRIBUTING.md | 38 +++ third_party/rust/shift_or_euc_c/COPYRIGHT | 9 + third_party/rust/shift_or_euc_c/Cargo.toml | 30 ++ .../rust/shift_or_euc_c/LICENSE-APACHE | 202 +++++++++++++ third_party/rust/shift_or_euc_c/LICENSE-MIT | 25 ++ third_party/rust/shift_or_euc_c/README.md | 13 + .../shift_or_euc_c/include/shift_or_euc.h | 88 ++++++ third_party/rust/shift_or_euc_c/src/lib.rs | 94 ++++++ toolkit/actors/ViewSourceChild.jsm | 23 +- .../content/widgets/browser-custom-element.js | 29 +- toolkit/library/rust/shared/Cargo.toml | 1 + toolkit/library/rust/shared/lib.rs | 1 + .../en-US/chrome/global/charsetMenu.dtd | 5 + 
.../chrome/global/charsetMenu.properties | 114 +++++++ toolkit/locales/jar.mn | 2 + toolkit/modules/CharsetMenu.jsm | 223 ++++++++++++++ toolkit/modules/moz.build | 4 + 82 files changed, 2538 insertions(+), 139 deletions(-) create mode 100644 docshell/test/browser/browser_bug1543077-1.js create mode 100644 docshell/test/browser/browser_bug1543077-2.js create mode 100644 docshell/test/browser/browser_bug1543077-4.js create mode 100644 docshell/test/browser/browser_bug234628-7.js create mode 100644 docshell/test/browser/file_bug1543077-1-child.html create mode 100644 docshell/test/browser/file_bug1543077-1.html create mode 100644 docshell/test/browser/file_bug1543077-2-child.html create mode 100644 docshell/test/browser/file_bug1543077-2.html create mode 100644 docshell/test/browser/file_bug1543077-4-child.html create mode 100644 docshell/test/browser/file_bug1543077-4.html create mode 100644 docshell/test/browser/file_bug234628-7-child.html create mode 100644 docshell/test/browser/file_bug234628-7-child.html^headers^ create mode 100644 docshell/test/browser/file_bug234628-7.html create mode 100644 intl/JapaneseDetector.h create mode 100644 third_party/rust/shift_or_euc/.cargo-checksum.json create mode 100644 third_party/rust/shift_or_euc/CONTRIBUTING.md create mode 100644 third_party/rust/shift_or_euc/COPYRIGHT create mode 100644 third_party/rust/shift_or_euc/Cargo.toml create mode 100644 third_party/rust/shift_or_euc/LICENSE-APACHE create mode 100644 third_party/rust/shift_or_euc/LICENSE-MIT create mode 100644 third_party/rust/shift_or_euc/README.md create mode 100644 third_party/rust/shift_or_euc/examples/detect.rs create mode 100644 third_party/rust/shift_or_euc/src/lib.rs create mode 100644 third_party/rust/shift_or_euc_c/.cargo-checksum.json create mode 100644 third_party/rust/shift_or_euc_c/CONTRIBUTING.md create mode 100644 third_party/rust/shift_or_euc_c/COPYRIGHT create mode 100644 third_party/rust/shift_or_euc_c/Cargo.toml create mode 100644 
third_party/rust/shift_or_euc_c/LICENSE-APACHE create mode 100644 third_party/rust/shift_or_euc_c/LICENSE-MIT create mode 100644 third_party/rust/shift_or_euc_c/README.md create mode 100644 third_party/rust/shift_or_euc_c/include/shift_or_euc.h create mode 100644 third_party/rust/shift_or_euc_c/src/lib.rs create mode 100644 toolkit/locales/en-US/chrome/global/charsetMenu.dtd create mode 100644 toolkit/locales/en-US/chrome/global/charsetMenu.properties create mode 100644 toolkit/modules/CharsetMenu.jsm diff --git a/Cargo.lock b/Cargo.lock index 4a30954c3fd2..454efbf3048d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2025,6 +2025,7 @@ dependencies = [ "rusqlite", "rust_minidump_writer_linux", "rustc_version", + "shift_or_euc_c", "static_prefs", "storage", "unic-langid", @@ -4631,6 +4632,26 @@ dependencies = [ "opaque-debug", ] +[[package]] +name = "shift_or_euc" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f930dea4685b9803954b9d74cdc175c6d946a22f2eafe5aa2e9a58cdcae7da8c" +dependencies = [ + "encoding_rs", + "memchr", +] + +[[package]] +name = "shift_or_euc_c" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c81ec08c8a68c45c48d8ef58b80ce038cc9945891c4a4996761e2ec5cba05abc" +dependencies = [ + "encoding_rs", + "shift_or_euc", +] + [[package]] name = "shlex" version = "0.1.1" diff --git a/browser/actors/BrowserTabChild.jsm b/browser/actors/BrowserTabChild.jsm index 48a403b797d7..1bfe8cc74b4e 100644 --- a/browser/actors/BrowserTabChild.jsm +++ b/browser/actors/BrowserTabChild.jsm @@ -98,8 +98,9 @@ class BrowserTabChild extends JSWindowActorChild { } catch (e) {} break; - case "ForceEncodingDetection": - docShell.forceEncodingDetection(); + case "UpdateCharacterSet": + docShell.charset = message.data.value; + docShell.gatherCharsetMenuTelemetry(); break; } } diff --git a/browser/base/content/appmenu-viewcache.inc.xhtml b/browser/base/content/appmenu-viewcache.inc.xhtml 
index e793a5941e0c..36917bf2cade 100644 --- a/browser/base/content/appmenu-viewcache.inc.xhtml +++ b/browser/base/content/appmenu-viewcache.inc.xhtml @@ -547,6 +547,16 @@ + + + + + + + + @@ -585,6 +595,11 @@ class="subviewbutton subviewbutton-iconic" data-l10n-id="appmenu-taskmanager" oncommand="switchToTabHavingURI('about:performance', true)"/> + %browserDTD; + +%charsetDTD; %textcontextDTD; diff --git a/browser/base/content/browser-menubar.inc b/browser/base/content/browser-menubar.inc index cd348e8e7817..925302e86b0f 100644 --- a/browser/base/content/browser-menubar.inc +++ b/browser/base/content/browser-menubar.inc @@ -143,7 +143,8 @@ - + @@ -202,7 +203,7 @@ #ifdef XP_MACOSX diff --git a/browser/base/content/browser.js b/browser/base/content/browser.js index c68f586479cf..45a3812e593e 100644 --- a/browser/base/content/browser.js +++ b/browser/base/content/browser.js @@ -27,6 +27,7 @@ XPCOMUtils.defineLazyModuleGetters(this, { BrowserUtils: "resource://gre/modules/BrowserUtils.jsm", BrowserWindowTracker: "resource:///modules/BrowserWindowTracker.jsm", CFRPageActions: "resource://activity-stream/lib/CFRPageActions.jsm", + CharsetMenu: "resource://gre/modules/CharsetMenu.jsm", Color: "resource://gre/modules/Color.jsm", ContextualIdentityService: "resource://gre/modules/ContextualIdentityService.jsm", @@ -4883,6 +4884,24 @@ function updateUserContextUIIndicator() { hbox.hidden = false; } +/** + * Makes the Character Encoding menu enabled or disabled as appropriate. + * To be called when the View menu or the app menu is opened. + */ +function updateCharacterEncodingMenuState() { + let charsetMenu = document.getElementById("charsetMenu"); + // gBrowser is null on Mac when the menubar shows in the context of + // non-browser windows. The above elements may be null depending on + // what parts of the menubar are present. E.g. no app menu on Mac. 
+ if (gBrowser && gBrowser.selectedBrowser.mayEnableCharacterEncodingMenu) { + if (charsetMenu) { + charsetMenu.removeAttribute("disabled"); + } + } else if (charsetMenu) { + charsetMenu.setAttribute("disabled", "true"); + } +} + var XULBrowserWindow = { // Stored Status, Link and Loading values status: "", @@ -7059,11 +7078,37 @@ function handleDroppedLink( } } -function BrowserForceEncodingDetection() { - gBrowser.selectedBrowser.forceEncodingDetection(); +function BrowserSetForcedCharacterSet(aCharset) { + if (aCharset) { + if (aCharset == "Japanese") { + aCharset = "Shift_JIS"; + } + gBrowser.selectedBrowser.characterSet = aCharset; + // Save the forced character-set + PlacesUIUtils.setCharsetForPage( + gBrowser.currentURI, + aCharset, + window + ).catch(Cu.reportError); + } + BrowserCharsetReload(); +} + +function BrowserCharsetReload() { BrowserReloadWithFlags(Ci.nsIWebNavigation.LOAD_FLAGS_CHARSET_CHANGE); } +function UpdateCurrentCharset(target) { + let selectedCharset = CharsetMenu.foldCharset( + gBrowser.selectedBrowser.characterSet, + gBrowser.selectedBrowser.charsetAutodetected + ); + for (let menuItem of target.getElementsByTagName("menuitem")) { + let isSelected = menuItem.getAttribute("charset") === selectedCharset; + menuItem.setAttribute("checked", isSelected); + } +} + var ToolbarContextMenu = { updateDownloadsAutoHide(popup) { let checkbox = document.getElementById( diff --git a/browser/components/customizableui/CustomizableWidgets.jsm b/browser/components/customizableui/CustomizableWidgets.jsm index 1d9fd4a71787..ff9271b8c4cf 100644 --- a/browser/components/customizableui/CustomizableWidgets.jsm +++ b/browser/components/customizableui/CustomizableWidgets.jsm @@ -439,7 +439,7 @@ const CustomizableWidgets = [ id: "characterencoding-button", l10nId: "repair-text-encoding-button", onCommand(aEvent) { - aEvent.view.BrowserForceEncodingDetection(); + aEvent.view.BrowserSetForcedCharacterSet("_autodetect_all"); }, }, { diff --git 
a/docshell/base/nsDocShell.cpp b/docshell/base/nsDocShell.cpp index 3c534088c8ee..fb0327ee068b 100644 --- a/docshell/base/nsDocShell.cpp +++ b/docshell/base/nsDocShell.cpp @@ -370,6 +370,7 @@ nsDocShell::nsDocShell(BrowsingContext* aBrowsingContext, : nsDocLoader(true), mContentWindowID(aContentWindowID), mBrowsingContext(aBrowsingContext), + mForcedCharset(nullptr), mParentCharset(nullptr), mTreeOwner(nullptr), mScrollbarPref(ScrollbarPreference::Auto), @@ -1526,7 +1527,7 @@ nsDocShell::GetCharset(nsACString& aCharset) { } NS_IMETHODIMP -nsDocShell::ForceEncodingDetection() { +nsDocShell::GatherCharsetMenuTelemetry() { nsCOMPtr viewer; GetContentViewer(getter_AddRefs(viewer)); if (!viewer) { @@ -1538,11 +1539,15 @@ nsDocShell::ForceEncodingDetection() { return NS_OK; } - mForcedAutodetection = true; - - LOGCHARSETMENU(("ENCODING_OVERRIDE_USED_AUTOMATIC")); - Telemetry::ScalarSet(Telemetry::ScalarID::ENCODING_OVERRIDE_USED_AUTOMATIC, - true); + if (mForcedAutodetection) { + LOGCHARSETMENU(("ENCODING_OVERRIDE_USED_AUTOMATIC")); + Telemetry::ScalarSet(Telemetry::ScalarID::ENCODING_OVERRIDE_USED_AUTOMATIC, + true); + } else { + LOGCHARSETMENU(("ENCODING_OVERRIDE_USED_MANUAL")); + Telemetry::ScalarSet(Telemetry::ScalarID::ENCODING_OVERRIDE_USED_MANUAL, + true); + } nsIURI* url = doc->GetOriginalURI(); bool isFileURL = url && SchemeIsFile(url); @@ -1556,6 +1561,28 @@ nsDocShell::ForceEncodingDetection() { Telemetry::AccumulateCategorical( Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::AutoOverridden); break; + case kCharsetFromUserForced: + case kCharsetFromUserForcedJapaneseAutoDetection: + LOGCHARSETMENU(("ManuallyOverridden")); + Telemetry::AccumulateCategorical( + Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::ManuallyOverridden); + break; + case kCharsetFromTopLevelDomain: + if (encoding == WINDOWS_1252_ENCODING) { + LOGCHARSETMENU(("UnlabeledInLk")); + Telemetry::AccumulateCategorical( + Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledInLk); + } else 
{ + LOGCHARSETMENU(("UnlabeledJp")); + Telemetry::AccumulateCategorical( + Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledJp); + } + break; + case kCharsetFromFinalJapaneseAutoDetection: + LOGCHARSETMENU(("UnlabeledJp")); + Telemetry::AccumulateCategorical( + Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledJp); + break; case kCharsetFromInitialAutoDetectionASCII: // Deliberately no final version LOGCHARSETMENU(("UnlabeledAscii")); @@ -1630,6 +1657,31 @@ nsDocShell::ForceEncodingDetection() { return NS_OK; } +NS_IMETHODIMP +nsDocShell::SetCharset(const nsACString& aCharset) { + mForcedAutodetection = false; + if (aCharset.IsEmpty()) { + mForcedCharset = nullptr; + return NS_OK; + } + if (aCharset.EqualsLiteral("_autodetect_all")) { + mForcedCharset = WINDOWS_1252_ENCODING; + mForcedAutodetection = true; + return NS_OK; + } + const Encoding* encoding = Encoding::ForLabel(aCharset); + if (!encoding) { + // Reject unknown labels + return NS_ERROR_INVALID_ARG; + } + if (!encoding->IsAsciiCompatible() && encoding != ISO_2022_JP_ENCODING) { + // Reject XSS hazards + return NS_ERROR_INVALID_ARG; + } + mForcedCharset = encoding; + return NS_OK; +} + void nsDocShell::SetParentCharset(const Encoding*& aCharset, int32_t aCharsetSource, nsIPrincipal* aPrincipal) { @@ -1990,6 +2042,30 @@ nsDocShell::GetMayEnableCharacterEncodingMenu( return NS_OK; } +NS_IMETHODIMP +nsDocShell::GetCharsetAutodetected(bool* aCharsetAutodetected) { + *aCharsetAutodetected = false; + if (!mContentViewer) { + return NS_OK; + } + Document* doc = mContentViewer->GetDocument(); + if (!doc) { + return NS_OK; + } + int32_t source = doc->GetDocumentCharacterSetSource(); + + if ((source >= kCharsetFromInitialAutoDetectionASCII && + source <= kCharsetFromFinalAutoDetectionFile) || + source == kCharsetFromUserForcedJapaneseAutoDetection || + source == kCharsetFromPendingUserForcedAutoDetection || + source == kCharsetFromInitialUserForcedAutoDetection || + source == 
kCharsetFromFinalUserForcedAutoDetection) { + *aCharsetAutodetected = true; + } + + return NS_OK; +} + NS_IMETHODIMP nsDocShell::GetAllDocShellsInSubtree(int32_t aItemType, DocShellEnumeratorDirection aDirection, diff --git a/docshell/base/nsDocShell.h b/docshell/base/nsDocShell.h index 06219003af07..6363f96bb671 100644 --- a/docshell/base/nsDocShell.h +++ b/docshell/base/nsDocShell.h @@ -327,9 +327,9 @@ class nsDocShell final : public nsDocLoader, void SetInFrameSwap(bool aInSwap) { mInFrameSwap = aInSwap; } bool InFrameSwap(); - bool GetForcedAutodetection() { return mForcedAutodetection; } + const mozilla::Encoding* GetForcedCharset() { return mForcedCharset; } - void ResetForcedAutodetection() { mForcedAutodetection = false; } + bool GetForcedAutodetection() { return mForcedAutodetection; } mozilla::HTMLEditor* GetHTMLEditorInternal(); nsresult SetHTMLEditorInternal(mozilla::HTMLEditor* aHTMLEditor); @@ -1189,6 +1189,7 @@ class nsDocShell final : public nsDocLoader, mozilla::UniquePtr mColorMatrix; + const mozilla::Encoding* mForcedCharset; const mozilla::Encoding* mParentCharset; // WEAK REFERENCES BELOW HERE. diff --git a/docshell/base/nsIDocShell.idl b/docshell/base/nsIDocShell.idl index 6ae176e22a80..e513294932c2 100644 --- a/docshell/base/nsIDocShell.idl +++ b/docshell/base/nsIDocShell.idl @@ -494,10 +494,17 @@ interface nsIDocShell : nsIDocShellTreeItem /** * Upon getting, returns the canonical encoding label of the document * currently loaded into this docshell. + * + * Upon setting, sets the forced encoding for compatibility with legacy callers. */ - readonly attribute ACString charset; + attribute ACString charset; - void forceEncodingDetection(); + /** + * Called when the user chose an encoding override from the character + * encoding menu. Separate from the setter for the charset property to avoid + * extensions adding noise to the data. 
+ */ + void gatherCharsetMenuTelemetry(); /** * In a child docshell, this is the charset of the parent docshell @@ -613,6 +620,11 @@ interface nsIDocShell : nsIDocShellTreeItem */ [infallible] readonly attribute boolean mayEnableCharacterEncodingMenu; + /** + * Indicates that the character encoding was autodetected. + */ + [infallible] readonly attribute boolean charsetAutodetected; + attribute nsIEditor editor; readonly attribute boolean editable; /* this docShell is editable */ readonly attribute boolean hasEditingSession; /* this docShell has an editing session */ diff --git a/docshell/test/browser/browser.ini b/docshell/test/browser/browser.ini index 4cd49780373d..fec0b9ed1a6c 100644 --- a/docshell/test/browser/browser.ini +++ b/docshell/test/browser/browser.ini @@ -21,6 +21,9 @@ support-files = file_bug234628-6-child.html file_bug234628-6-child.html^headers^ file_bug234628-6.html + file_bug234628-7-child.html + file_bug234628-7-child.html^headers^ + file_bug234628-7.html file_bug234628-8-child.html file_bug234628-8.html file_bug234628-9-child.html @@ -40,8 +43,14 @@ support-files = file_bug1328501.html file_bug1328501_frame.html file_bug1328501_framescript.js + file_bug1543077-1-child.html + file_bug1543077-1.html + file_bug1543077-2-child.html + file_bug1543077-2.html file_bug1543077-3-child.html file_bug1543077-3.html + file_bug1543077-4-child.html + file_bug1543077-4.html file_multiple_pushState.html file_onbeforeunload_0.html file_onbeforeunload_1.html @@ -76,7 +85,10 @@ skip-if = os == "linux" && bits == 64 && !debug # Bug 1607713 fission && os == "mac" && debug # Bug 1713903 - new Fission platform triage [browser_backforward_userinteraction_about.js] +[browser_bug1543077-1.js] +[browser_bug1543077-2.js] [browser_bug1543077-3.js] +[browser_bug1543077-4.js] [browser_bug1594938.js] [browser_bug1206879.js] [browser_bug1309900_crossProcessHistoryNavigation.js] @@ -106,6 +118,7 @@ skip-if = !fission || !crashreporter # On a crash we only keep history when fiss 
[browser_bug234628-4.js] [browser_bug234628-5.js] [browser_bug234628-6.js] +[browser_bug234628-7.js] [browser_bug234628-8.js] [browser_bug234628-9.js] [browser_bug349769.js] diff --git a/docshell/test/browser/browser_bug134911.js b/docshell/test/browser/browser_bug134911.js index 2fd3e82d4a65..c4e2ab42a91f 100644 --- a/docshell/test/browser/browser_bug134911.js +++ b/docshell/test/browser/browser_bug134911.js @@ -29,7 +29,7 @@ function afterOpen() { content.document.getElementById("testinput").value = TEXT.enteredText2; }).then(() => { /* Force the page encoding to Shift_JIS */ - BrowserForceEncodingDetection(); + BrowserSetForcedCharacterSet("Shift_JIS"); }); } diff --git a/docshell/test/browser/browser_bug1543077-1.js b/docshell/test/browser/browser_bug1543077-1.js new file mode 100644 index 000000000000..61507158e8dd --- /dev/null +++ b/docshell/test/browser/browser_bug1543077-1.js @@ -0,0 +1,47 @@ +function test() { + var rootDir = "http://mochi.test:8888/browser/docshell/test/browser/"; + runCharsetTest( + rootDir + "file_bug1543077-1.html", + afterOpen, + "Japanese", + afterChangeCharset + ); +} + +function afterOpen() { + is( + content.document.documentElement.textContent.indexOf("\u0434"), + 131, + "Parent doc should be IBM866 initially" + ); + + is( + content.frames[0].document.documentElement.textContent.indexOf("\u0434"), + 87, + "Child doc should be IBM866 initially" + ); +} + +function afterChangeCharset() { + is( + content.document.documentElement.textContent.indexOf("\u3042"), + 131, + "Parent doc should decode as EUC-JP subsequently" + ); + is( + content.frames[0].document.documentElement.textContent.indexOf("\u3042"), + 87, + "Child doc should decode as EUC-JP subsequently" + ); + + is( + content.document.characterSet, + "EUC-JP", + "Parent doc should report EUC-JP subsequently" + ); + is( + content.frames[0].document.characterSet, + "EUC-JP", + "Child doc should report EUC-JP subsequently" + ); +} diff --git 
a/docshell/test/browser/browser_bug1543077-2.js b/docshell/test/browser/browser_bug1543077-2.js new file mode 100644 index 000000000000..30736d7c76ad --- /dev/null +++ b/docshell/test/browser/browser_bug1543077-2.js @@ -0,0 +1,47 @@ +function test() { + var rootDir = "http://mochi.test:8888/browser/docshell/test/browser/"; + runCharsetTest( + rootDir + "file_bug1543077-2.html", + afterOpen, + "Japanese", + afterChangeCharset + ); +} + +function afterOpen() { + is( + content.document.documentElement.textContent.indexOf("\u0412"), + 134, + "Parent doc should be IBM866 initially" + ); + + is( + content.frames[0].document.documentElement.textContent.indexOf("\u0412"), + 90, + "Child doc should be IBM866 initially" + ); +} + +function afterChangeCharset() { + is( + content.document.documentElement.textContent.indexOf("\u3042"), + 134, + "Parent doc should decode as Shift_JIS subsequently" + ); + is( + content.frames[0].document.documentElement.textContent.indexOf("\u3042"), + 90, + "Child doc should decode as Shift_JIS subsequently" + ); + + is( + content.document.characterSet, + "Shift_JIS", + "Parent doc should report Shift_JIS subsequently" + ); + is( + content.frames[0].document.characterSet, + "Shift_JIS", + "Child doc should report Shift_JIS subsequently" + ); +} diff --git a/docshell/test/browser/browser_bug1543077-3.js b/docshell/test/browser/browser_bug1543077-3.js index 7cef4aef1011..fea8c6a3656a 100644 --- a/docshell/test/browser/browser_bug1543077-3.js +++ b/docshell/test/browser/browser_bug1543077-3.js @@ -3,6 +3,7 @@ function test() { runCharsetTest( rootDir + "file_bug1543077-3.html", afterOpen, + "Japanese", afterChangeCharset ); } diff --git a/docshell/test/browser/browser_bug1543077-4.js b/docshell/test/browser/browser_bug1543077-4.js new file mode 100644 index 000000000000..ba20352b7f80 --- /dev/null +++ b/docshell/test/browser/browser_bug1543077-4.js @@ -0,0 +1,47 @@ +function test() { + var rootDir = 
"http://mochi.test:8888/browser/docshell/test/browser/"; + runCharsetTest( + rootDir + "file_bug1543077-4.html", + afterOpen, + "Japanese", + afterChangeCharset + ); +} + +function afterOpen() { + is( + content.document.documentElement.textContent.indexOf("\u0434"), + 131, + "Parent doc should be IBM866 initially" + ); + + is( + content.frames[0].document.documentElement.textContent.indexOf("\u0412"), + 90, + "Child doc should be IBM866 initially" + ); +} + +function afterChangeCharset() { + is( + content.document.documentElement.textContent.indexOf("\u3042"), + 131, + "Parent doc should decode as EUC-JP subsequently" + ); + is( + content.frames[0].document.documentElement.textContent.indexOf("\u3042"), + 90, + "Child doc should decode as Shift_JIS subsequently" + ); + + is( + content.document.characterSet, + "EUC-JP", + "Parent doc should report EUC-JP subsequently" + ); + is( + content.frames[0].document.characterSet, + "Shift_JIS", + "Child doc should report Shift_JIS subsequently" + ); +} diff --git a/docshell/test/browser/browser_bug1648464-1.js b/docshell/test/browser/browser_bug1648464-1.js index c2a8093a3d43..d643a253e9e3 100644 --- a/docshell/test/browser/browser_bug1648464-1.js +++ b/docshell/test/browser/browser_bug1648464-1.js @@ -3,6 +3,7 @@ function test() { runCharsetTest( rootDir + "file_bug1648464-1.html", afterOpen, + "_autodetect_all", afterChangeCharset ); } diff --git a/docshell/test/browser/browser_bug1688368-1.js b/docshell/test/browser/browser_bug1688368-1.js index 04fc3dd9a8f8..ac3b8f2a38b5 100644 --- a/docshell/test/browser/browser_bug1688368-1.js +++ b/docshell/test/browser/browser_bug1688368-1.js @@ -3,6 +3,7 @@ function test() { runCharsetTest( rootDir + "file_bug1688368-1.sjs", afterOpen, + "UTF-8", afterChangeCharset ); } diff --git a/docshell/test/browser/browser_bug234628-1.js b/docshell/test/browser/browser_bug234628-1.js index 566da65bcabe..e26b582ff790 100644 --- a/docshell/test/browser/browser_bug234628-1.js +++ 
b/docshell/test/browser/browser_bug234628-1.js @@ -3,6 +3,7 @@ function test() { runCharsetTest( rootDir + "file_bug234628-1.html", afterOpen, + "windows-1251", afterChangeCharset ); } @@ -23,25 +24,24 @@ function afterOpen() { function afterChangeCharset() { is( - content.document.documentElement.textContent.indexOf("\u20AC"), + content.document.documentElement.textContent.indexOf("\u0402"), 129, - "Parent doc should be windows-1252 subsequently" + "Parent doc should decode as windows-1251 subsequently" ); - is( - content.frames[0].document.documentElement.textContent.indexOf("\u20AC"), + content.frames[0].document.documentElement.textContent.indexOf("\u0402"), 85, - "Child doc should be windows-1252 subsequently" + "Child doc should decode as windows-1251 subsequently" ); is( content.document.characterSet, - "windows-1252", - "Parent doc should report windows-1252 subsequently" + "windows-1251", + "Parent doc should report windows-1251 subsequently" ); is( content.frames[0].document.characterSet, - "windows-1252", - "Child doc should report windows-1252 subsequently" + "windows-1251", + "Child doc should report windows-1251 subsequently" ); } diff --git a/docshell/test/browser/browser_bug234628-10.js b/docshell/test/browser/browser_bug234628-10.js index 8fb51cf27cd6..d507378ed64c 100644 --- a/docshell/test/browser/browser_bug234628-10.js +++ b/docshell/test/browser/browser_bug234628-10.js @@ -3,6 +3,7 @@ function test() { runCharsetTest( rootDir + "file_bug234628-10.html", afterOpen, + "windows-1251", afterChangeCharset ); } @@ -23,9 +24,9 @@ function afterOpen() { function afterChangeCharset() { is( - content.document.documentElement.textContent.indexOf("\u20AC"), + content.document.documentElement.textContent.indexOf("\u0402"), 151, - "Parent doc should be windows-1252 initially" + "Parent doc should decode as windows-1251 subsequently" ); is( content.frames[0].document.documentElement.textContent.indexOf("\u20AC"), @@ -35,8 +36,8 @@ function 
afterChangeCharset() { is( content.document.characterSet, - "windows-1252", - "Parent doc should report windows-1252 subsequently" + "windows-1251", + "Parent doc should report windows-1251 subsequently" ); is( content.frames[0].document.characterSet, diff --git a/docshell/test/browser/browser_bug234628-11.js b/docshell/test/browser/browser_bug234628-11.js index d11645ff76d7..be71746aad0f 100644 --- a/docshell/test/browser/browser_bug234628-11.js +++ b/docshell/test/browser/browser_bug234628-11.js @@ -3,6 +3,7 @@ function test() { runCharsetTest( rootDir + "file_bug234628-11.html", afterOpen, + "windows-1251", afterChangeCharset ); } @@ -23,9 +24,9 @@ function afterOpen() { function afterChangeCharset() { is( - content.document.documentElement.textContent.indexOf("\u20AC"), + content.document.documentElement.textContent.indexOf("\u0402"), 193, - "Parent doc should be windows-1252 subsequently" + "Parent doc should decode as windows-1251 subsequently" ); is( content.frames[0].document.documentElement.textContent.indexOf("\u20AC"), @@ -35,8 +36,8 @@ function afterChangeCharset() { is( content.document.characterSet, - "windows-1252", - "Parent doc should report windows-1252 subsequently" + "windows-1251", + "Parent doc should report windows-1251 subsequently" ); is( content.frames[0].document.characterSet, diff --git a/docshell/test/browser/browser_bug234628-2.js b/docshell/test/browser/browser_bug234628-2.js index da93dc2ac254..bad7a3c44265 100644 --- a/docshell/test/browser/browser_bug234628-2.js +++ b/docshell/test/browser/browser_bug234628-2.js @@ -3,6 +3,7 @@ function test() { runCharsetTest( rootDir + "file_bug234628-2.html", afterOpen, + "windows-1251", afterChangeCharset ); } @@ -25,25 +26,26 @@ function afterOpen() { function afterChangeCharset() { is( - content.document.documentElement.textContent.indexOf("\u20AC"), + content.document.documentElement.textContent.indexOf("\u0402"), 129, - "Parent doc should be windows-1252 subsequently" + "Parent doc should 
decode as windows-1251 subsequently" ); - is( - content.frames[0].document.documentElement.textContent.indexOf("\u20AC"), + content.frames[0].document.documentElement.textContent.indexOf( + "\u0432\u201A\u00AC" + ), 78, - "Child doc should be UTF-8 subsequently" + "Child doc should decode as windows-1251 subsequently" ); is( content.document.characterSet, - "windows-1252", - "Parent doc should report windows-1252 subsequently" + "windows-1251", + "Parent doc should report windows-1251 subsequently" ); is( content.frames[0].document.characterSet, - "UTF-8", - "Child doc should report UTF-8 subsequently" + "windows-1251", + "Child doc should report windows-1251 subsequently" ); } diff --git a/docshell/test/browser/browser_bug234628-3.js b/docshell/test/browser/browser_bug234628-3.js index 8a143b51a62d..218a81e144e7 100644 --- a/docshell/test/browser/browser_bug234628-3.js +++ b/docshell/test/browser/browser_bug234628-3.js @@ -3,6 +3,7 @@ function test() { runCharsetTest( rootDir + "file_bug234628-3.html", afterOpen, + "windows-1251", afterChangeCharset ); } @@ -23,25 +24,26 @@ function afterOpen() { function afterChangeCharset() { is( - content.document.documentElement.textContent.indexOf("\u20AC"), + content.document.documentElement.textContent.indexOf("\u0402"), 118, - "Parent doc should be windows-1252 subsequently" + "Parent doc should decode as windows-1251 subsequently" ); - is( - content.frames[0].document.documentElement.textContent.indexOf("\u20AC"), + content.frames[0].document.documentElement.textContent.indexOf( + "\u0432\u201A\u00AC" + ), 73, - "Child doc should be utf-8 subsequently" + "Child doc should decode as windows-1251 subsequently" ); is( content.document.characterSet, - "windows-1252", - "Parent doc should report windows-1252 subsequently" + "windows-1251", + "Parent doc should report windows-1251 subsequently" ); is( content.frames[0].document.characterSet, - "UTF-8", - "Child doc should report UTF-8 subsequently" + "windows-1251", + "Child 
doc should report windows-1251 subsequently" ); } diff --git a/docshell/test/browser/browser_bug234628-4.js b/docshell/test/browser/browser_bug234628-4.js index 19ec0f8dbfd9..c7c629bf3eb9 100644 --- a/docshell/test/browser/browser_bug234628-4.js +++ b/docshell/test/browser/browser_bug234628-4.js @@ -3,6 +3,7 @@ function test() { runCharsetTest( rootDir + "file_bug234628-4.html", afterOpen, + "windows-1251", afterChangeCharset ); } @@ -23,9 +24,9 @@ function afterOpen() { function afterChangeCharset() { is( - content.document.documentElement.textContent.indexOf("\u20AC"), + content.document.documentElement.textContent.indexOf("\u0402"), 132, - "Parent doc should decode as windows-1252 subsequently" + "Parent doc should decode as windows-1251 subsequently" ); is( content.frames[0].document.documentElement.textContent.indexOf("\u20AC"), @@ -35,8 +36,8 @@ function afterChangeCharset() { is( content.document.characterSet, - "windows-1252", - "Parent doc should report windows-1252 subsequently" + "windows-1251", + "Parent doc should report windows-1251 subsequently" ); is( content.frames[0].document.characterSet, diff --git a/docshell/test/browser/browser_bug234628-5.js b/docshell/test/browser/browser_bug234628-5.js index 77753ed78d47..6c3462b9fc51 100644 --- a/docshell/test/browser/browser_bug234628-5.js +++ b/docshell/test/browser/browser_bug234628-5.js @@ -3,6 +3,7 @@ function test() { runCharsetTest( rootDir + "file_bug234628-5.html", afterOpen, + "windows-1251", afterChangeCharset ); } @@ -23,9 +24,9 @@ function afterOpen() { function afterChangeCharset() { is( - content.document.documentElement.textContent.indexOf("\u20AC"), + content.document.documentElement.textContent.indexOf("\u0402"), 146, - "Parent doc should be windows-1252 subsequently" + "Parent doc should decode as windows-1251 subsequently" ); is( content.frames[0].document.documentElement.textContent.indexOf("\u20AC"), @@ -35,8 +36,8 @@ function afterChangeCharset() { is( content.document.characterSet, 
- "windows-1252", - "Parent doc should report windows-1252 subsequently" + "windows-1251", + "Parent doc should report windows-1251 subsequently" ); is( content.frames[0].document.characterSet, diff --git a/docshell/test/browser/browser_bug234628-6.js b/docshell/test/browser/browser_bug234628-6.js index 88ff6c1a8280..8b5994d52da5 100644 --- a/docshell/test/browser/browser_bug234628-6.js +++ b/docshell/test/browser/browser_bug234628-6.js @@ -3,6 +3,7 @@ function test() { runCharsetTest( rootDir + "file_bug234628-6.html", afterOpen, + "windows-1251", afterChangeCharset ); } @@ -23,21 +24,20 @@ function afterOpen() { function afterChangeCharset() { is( - content.document.documentElement.textContent.indexOf("\u20AC"), + content.document.documentElement.textContent.indexOf("\u0402"), 190, - "Parent doc should be windows-1252 subsequently" + "Parent doc should decode as windows-1251 subsequently" ); - is( content.frames[0].document.documentElement.textContent.indexOf("\u20AC"), 109, - "Child doc should be utf-16 subsequently" + "Child doc should decode as utf-16 subsequently" ); is( content.document.characterSet, - "windows-1252", - "Parent doc should report windows-1252 subsequently" + "windows-1251", + "Parent doc should report windows-1251 subsequently" ); is( content.frames[0].document.characterSet, diff --git a/docshell/test/browser/browser_bug234628-7.js b/docshell/test/browser/browser_bug234628-7.js new file mode 100644 index 000000000000..10c4b432f96e --- /dev/null +++ b/docshell/test/browser/browser_bug234628-7.js @@ -0,0 +1,49 @@ +function test() { + var rootDir = "http://mochi.test:8888/browser/docshell/test/browser/"; + runCharsetTest( + rootDir + "file_bug234628-7.html", + afterOpen, + "windows-1251", + afterChangeCharset + ); +} + +function afterOpen() { + is( + content.document.documentElement.textContent.indexOf("\u20AC"), + 188, + "Parent doc should be windows-1252 initially" + ); + + is( + 
content.frames[0].document.documentElement.textContent.indexOf("\u20AC"), + 107, + "Child doc should be utf-8 initially" + ); +} + +function afterChangeCharset() { + is( + content.document.documentElement.textContent.indexOf("\u0402"), + 188, + "Parent doc should decode as windows-1251 subsequently" + ); + is( + content.frames[0].document.documentElement.textContent.indexOf( + "\u0432\u201A\u00AC" + ), + 107, + "Child doc should decode as windows-1251 subsequently" + ); + + is( + content.document.characterSet, + "windows-1251", + "Parent doc should report windows-1251 subsequently" + ); + is( + content.frames[0].document.characterSet, + "windows-1251", + "Child doc should report windows-1251 subsequently" + ); +} diff --git a/docshell/test/browser/browser_bug234628-8.js b/docshell/test/browser/browser_bug234628-8.js index 024a3d4d64fb..1c27619f2e13 100644 --- a/docshell/test/browser/browser_bug234628-8.js +++ b/docshell/test/browser/browser_bug234628-8.js @@ -1,6 +1,6 @@ function test() { var rootDir = "http://mochi.test:8888/browser/docshell/test/browser/"; - runCharsetCheck(rootDir + "file_bug234628-8.html", afterOpen); + runCharsetTest(rootDir + "file_bug234628-8.html", afterOpen); } function afterOpen() { diff --git a/docshell/test/browser/browser_bug234628-9.js b/docshell/test/browser/browser_bug234628-9.js index ceb7dc4e63f2..2ba714c09cad 100644 --- a/docshell/test/browser/browser_bug234628-9.js +++ b/docshell/test/browser/browser_bug234628-9.js @@ -1,6 +1,6 @@ function test() { var rootDir = "http://mochi.test:8888/browser/docshell/test/browser/"; - runCharsetCheck(rootDir + "file_bug234628-9.html", afterOpen); + runCharsetTest(rootDir + "file_bug234628-9.html", afterOpen); } function afterOpen() { diff --git a/docshell/test/browser/browser_bug673087-2.js b/docshell/test/browser/browser_bug673087-2.js index 13a7a2a82cc6..6ec472ad6f23 100644 --- a/docshell/test/browser/browser_bug673087-2.js +++ b/docshell/test/browser/browser_bug673087-2.js @@ -3,6 +3,7 @@ 
function test() { runCharsetTest( rootDir + "file_bug673087-2.html", afterOpen, + "windows-1252", afterChangeCharset ); } diff --git a/docshell/test/browser/browser_bug92473.js b/docshell/test/browser/browser_bug92473.js index 7e386f5ee9f1..3d8e8cc6b7e5 100644 --- a/docshell/test/browser/browser_bug92473.js +++ b/docshell/test/browser/browser_bug92473.js @@ -34,7 +34,7 @@ function afterOpen() { /* Test that the content on load is the expected wrong decoding */ testContent(wrongText).then(() => { - BrowserForceEncodingDetection(); + BrowserSetForcedCharacterSet("Shift_JIS"); }); } diff --git a/docshell/test/browser/file_bug1543077-1-child.html b/docshell/test/browser/file_bug1543077-1-child.html new file mode 100644 index 000000000000..d244b27717bf --- /dev/null +++ b/docshell/test/browser/file_bug1543077-1-child.html @@ -0,0 +1,11 @@ + + + + +No encoding declaration in parent or child + + +

Hiragana letter a if decoded as EUC-JP: ¤¢

+ + + diff --git a/docshell/test/browser/file_bug1543077-1.html b/docshell/test/browser/file_bug1543077-1.html new file mode 100644 index 000000000000..4d37ec18fc4a --- /dev/null +++ b/docshell/test/browser/file_bug1543077-1.html @@ -0,0 +1,16 @@ + + + + +No encoding declaration in parent or child + + +

No encoding declaration in parent or child

+ +

Hiragana letter a if decoded as EUC-JP: ¤¢

+ + + + + + diff --git a/docshell/test/browser/file_bug1543077-2-child.html b/docshell/test/browser/file_bug1543077-2-child.html new file mode 100644 index 000000000000..c3415e97a490 --- /dev/null +++ b/docshell/test/browser/file_bug1543077-2-child.html @@ -0,0 +1,11 @@ + + + + +No encoding declaration in parent or child + + +

Hiragana letter a if decoded as Shift_JIS: ‚ 

+ + + diff --git a/docshell/test/browser/file_bug1543077-2.html b/docshell/test/browser/file_bug1543077-2.html new file mode 100644 index 000000000000..c0ad81570f7c --- /dev/null +++ b/docshell/test/browser/file_bug1543077-2.html @@ -0,0 +1,16 @@ + + + + +No encoding declaration in parent or child + + +

No encoding declaration in parent or child

+ +

Hiragana letter a if decoded as Shift_JIS: ‚ 

+ + + + + + diff --git a/docshell/test/browser/file_bug1543077-4-child.html b/docshell/test/browser/file_bug1543077-4-child.html new file mode 100644 index 000000000000..c3415e97a490 --- /dev/null +++ b/docshell/test/browser/file_bug1543077-4-child.html @@ -0,0 +1,11 @@ + + + + +No encoding declaration in parent or child + + +

Hiragana letter a if decoded as Shift_JIS: ‚ 

+ + + diff --git a/docshell/test/browser/file_bug1543077-4.html b/docshell/test/browser/file_bug1543077-4.html new file mode 100644 index 000000000000..b8feb4cba66f --- /dev/null +++ b/docshell/test/browser/file_bug1543077-4.html @@ -0,0 +1,16 @@ + + + + +No encoding declaration in parent or child + + +

No encoding declaration in parent or child

+ +

Hiragana letter a if decoded as EUC-JP: ¤¢

+ + + + + + diff --git a/docshell/test/browser/file_bug234628-7-child.html b/docshell/test/browser/file_bug234628-7-child.html new file mode 100644 index 000000000000..c761ace1016b --- /dev/null +++ b/docshell/test/browser/file_bug234628-7-child.html @@ -0,0 +1,12 @@ + + + + +meta declaration in parent and BOMless UTF-8 with HTTP charset in child + + +

Euro sign if decoded as UTF-8: €

+

a with diaeresis if decoded as UTF-8: ä

+ + + diff --git a/docshell/test/browser/file_bug234628-7-child.html^headers^ b/docshell/test/browser/file_bug234628-7-child.html^headers^ new file mode 100644 index 000000000000..2d1c08b9e8aa --- /dev/null +++ b/docshell/test/browser/file_bug234628-7-child.html^headers^ @@ -0,0 +1 @@ +Content-Type: text/html; charset=utf-8 diff --git a/docshell/test/browser/file_bug234628-7.html b/docshell/test/browser/file_bug234628-7.html new file mode 100644 index 000000000000..7cb506096d9c --- /dev/null +++ b/docshell/test/browser/file_bug234628-7.html @@ -0,0 +1,18 @@ + + + + + +meta declaration in parent and BOMless UTF-8 with HTTP charset in child + + +

meta declaration in parent and BOMless UTF-8 with HTTP charset in child

+ +

Euro sign if decoded as Windows-1252: €

+

a with diaeresis if decoded as Windows-1252: ä

+ + + + + + diff --git a/docshell/test/browser/head.js b/docshell/test/browser/head.js index dd6d9242b9e1..47a649606f6b 100644 --- a/docshell/test/browser/head.js +++ b/docshell/test/browser/head.js @@ -63,22 +63,34 @@ function timelineTestOpenUrl(url) { } /** - * Helper function for encoding override tests, loads URL, runs check1, - * forces encoding detection, runs check2. + * Helper function for charset tests. It loads |url| in a new tab, + * runs |check1| in a ContentTask when the page is ready, switches the + * charset to |charset|, and then runs |check2| in a ContentTask when + * the page has finished reloading. + * + * |charset| and |check2| can be omitted, in which case the test + * finishes when |check1| completes. */ -function runCharsetTest(url, check1, check2) { +function runCharsetTest(url, check1, charset, check2) { waitForExplicitFinish(); BrowserTestUtils.openNewForegroundTab(gBrowser, url, true).then(afterOpen); function afterOpen() { - BrowserTestUtils.browserLoaded(gBrowser.selectedBrowser).then( - afterChangeCharset - ); + if (charset) { + BrowserTestUtils.browserLoaded(gBrowser.selectedBrowser).then( + afterChangeCharset + ); - SpecialPowers.spawn(gBrowser.selectedBrowser, [], check1).then(() => { - BrowserForceEncodingDetection(); - }); + SpecialPowers.spawn(gBrowser.selectedBrowser, [], check1).then(() => { + BrowserSetForcedCharacterSet(charset); + }); + } else { + SpecialPowers.spawn(gBrowser.selectedBrowser, [], check1).then(() => { + gBrowser.removeCurrentTab(); + finish(); + }); + } } function afterChangeCharset() { @@ -89,23 +101,6 @@ function runCharsetTest(url, check1, check2) { } } -/** - * Helper function for charset tests. It loads |url| in a new tab, - * runs |check|. 
- */ -function runCharsetCheck(url, check) { - waitForExplicitFinish(); - - BrowserTestUtils.openNewForegroundTab(gBrowser, url, true).then(afterOpen); - - function afterOpen() { - SpecialPowers.spawn(gBrowser.selectedBrowser, [], check).then(() => { - gBrowser.removeCurrentTab(); - finish(); - }); - } -} - async function pushState(url, frameId) { info( `Doing a pushState, expecting to load ${url} ${ diff --git a/dom/base/nsDOMWindowUtils.cpp b/dom/base/nsDOMWindowUtils.cpp index 1aef13ee9340..5b42456e5bda 100644 --- a/dom/base/nsDOMWindowUtils.cpp +++ b/dom/base/nsDOMWindowUtils.cpp @@ -362,11 +362,8 @@ nsDOMWindowUtils::GetDocCharsetIsForced(bool* aIsForced) { *aIsForced = false; Document* doc = GetDocument(); - if (doc) { - auto source = doc->GetDocumentCharacterSetSource(); - *aIsForced = source == kCharsetFromInitialUserForcedAutoDetection || - source == kCharsetFromFinalUserForcedAutoDetection; - } + *aIsForced = + doc && doc->GetDocumentCharacterSetSource() >= kCharsetFromUserForced; return NS_OK; } diff --git a/dom/html/nsHTMLDocument.cpp b/dom/html/nsHTMLDocument.cpp index c92ba1b2e29e..e2fddbdbcabb 100644 --- a/dom/html/nsHTMLDocument.cpp +++ b/dom/html/nsHTMLDocument.cpp @@ -206,7 +206,7 @@ void nsHTMLDocument::TryUserForcedCharset(nsIContentViewer* aCv, nsIDocShell* aDocShell, int32_t& aCharsetSource, NotNull& aEncoding) { - if (aCharsetSource >= kCharsetFromXmlDeclarationUtf16) { + if (kCharsetFromUserForced <= aCharsetSource) { return; } @@ -215,11 +215,20 @@ void nsHTMLDocument::TryUserForcedCharset(nsIContentViewer* aCv, return; } - if (aDocShell && nsDocShell::Cast(aDocShell)->GetForcedAutodetection()) { + if (aDocShell) { // This is the Character Encoding menu code path in Firefox - aEncoding = WINDOWS_1252_ENCODING; - aCharsetSource = kCharsetFromPendingUserForcedAutoDetection; - nsDocShell::Cast(aDocShell)->ResetForcedAutodetection(); + auto encoding = nsDocShell::Cast(aDocShell)->GetForcedCharset(); + + if (encoding) { + if 
(!IsAsciiCompatible(encoding)) { + return; + } + aEncoding = WrapNotNull(encoding); + aCharsetSource = nsDocShell::Cast(aDocShell)->GetForcedAutodetection() + ? kCharsetFromPendingUserForcedAutoDetection + : kCharsetFromUserForced; + aDocShell->SetCharset(""_ns); + } } } @@ -229,7 +238,7 @@ void nsHTMLDocument::TryParentCharset(nsIDocShell* aDocShell, if (!aDocShell) { return; } - if (aCharsetSource >= kCharsetFromXmlDeclarationUtf16) { + if (aCharsetSource >= kCharsetFromUserForced) { return; } @@ -241,7 +250,9 @@ void nsHTMLDocument::TryParentCharset(nsIDocShell* aDocShell, if (!parentCharset) { return; } - if (kCharsetFromPendingUserForcedAutoDetection == parentSource || + if (kCharsetFromUserForced == parentSource || + kCharsetFromUserForcedJapaneseAutoDetection == parentSource || + kCharsetFromPendingUserForcedAutoDetection == parentSource || kCharsetFromInitialUserForcedAutoDetection == parentSource || kCharsetFromFinalUserForcedAutoDetection == parentSource) { if (WillIgnoreCharsetOverride() || @@ -250,7 +261,11 @@ void nsHTMLDocument::TryParentCharset(nsIDocShell* aDocShell, return; } aEncoding = WrapNotNull(parentCharset); - aCharsetSource = kCharsetFromPendingUserForcedAutoDetection; + aCharsetSource = + (kCharsetFromUserForced == parentSource || + kCharsetFromUserForcedJapaneseAutoDetection == parentSource) + ? 
kCharsetFromUserForced + : kCharsetFromPendingUserForcedAutoDetection; return; } @@ -678,9 +693,11 @@ bool nsHTMLDocument::WillIgnoreCharsetOverride() { switch (mCharacterSetSource) { case kCharsetUninitialized: case kCharsetFromFallback: + case kCharsetFromTopLevelDomain: case kCharsetFromDocTypeDefault: case kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8: case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD: + case kCharsetFromFinalJapaneseAutoDetection: case kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8: case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD: case kCharsetFromParentFrame: @@ -688,6 +705,8 @@ bool nsHTMLDocument::WillIgnoreCharsetOverride() { case kCharsetFromMetaPrescan: case kCharsetFromMetaTag: case kCharsetFromChannel: + case kCharsetFromUserForced: + case kCharsetFromUserForcedJapaneseAutoDetection: return false; } diff --git a/dom/html/nsHTMLDocument.h b/dom/html/nsHTMLDocument.h index a235f540458c..62fd6686c462 100644 --- a/dom/html/nsHTMLDocument.h +++ b/dom/html/nsHTMLDocument.h @@ -176,6 +176,9 @@ class nsHTMLDocument : public mozilla::dom::Document { void TryUserForcedCharset(nsIContentViewer* aCv, nsIDocShell* aDocShell, int32_t& aCharsetSource, NotNull& aEncoding); + static void TryCacheCharset(nsICachingChannel* aCachingChannel, + int32_t& aCharsetSource, + NotNull& aEncoding); void TryParentCharset(nsIDocShell* aDocShell, int32_t& charsetSource, NotNull& aEncoding); diff --git a/dom/interfaces/base/nsIBrowser.idl b/dom/interfaces/base/nsIBrowser.idl index d107e7d47eba..1a05957dba83 100644 --- a/dom/interfaces/base/nsIBrowser.idl +++ b/dom/interfaces/base/nsIBrowser.idl @@ -67,6 +67,12 @@ interface nsIBrowser : nsISupports */ attribute boolean mayEnableCharacterEncodingMenu; + /** + * Whether or not the character encoding was detected by analyzing + * content (as opposed to reading a protocol label). 
+ */ + attribute boolean charsetAutodetected; + /** * Called by Gecko to update the browser when its state changes. * @@ -96,6 +102,8 @@ interface nsIBrowser : nsISupports * @param aCharset the character set of the document * @param aMayEnableCharacterEncodingMenu whether or not the content encoding * menu may be enabled + * @param aCharsetAutodetected whether or not the given character set was + * autodetected * @param aDocumentURI the URI of the new document * @param aTitle the title of the new doucment * @param aContentPrincipal the security principal of the new document @@ -112,6 +120,7 @@ interface nsIBrowser : nsISupports void updateForLocationChange(in nsIURI aLocation, in AString aCharset, in boolean aMayEnableCharacterEncodingMenu, + in boolean aCharsetAutodetected, in nsIURI aDocumentURI, in AString aTitle, in nsIPrincipal aContentPrincipal, diff --git a/dom/ipc/BrowserChild.cpp b/dom/ipc/BrowserChild.cpp index 96fa0f70dc40..4cdedcf9fbfc 100644 --- a/dom/ipc/BrowserChild.cpp +++ b/dom/ipc/BrowserChild.cpp @@ -3602,6 +3602,7 @@ NS_IMETHODIMP BrowserChild::OnStateChange(nsIWebProgress* aWebProgress, stateChangeData->isNavigating() = docShell->GetIsNavigating(); stateChangeData->mayEnableCharacterEncodingMenu() = docShell->GetMayEnableCharacterEncodingMenu(); + stateChangeData->charsetAutodetected() = docShell->GetCharsetAutodetected(); RefPtr document = browsingContext->GetExtantDocument(); if (document && aStateFlags & nsIWebProgressListener::STATE_STOP) { @@ -3700,6 +3701,8 @@ NS_IMETHODIMP BrowserChild::OnLocationChange(nsIWebProgress* aWebProgress, locationChangeData->mayEnableCharacterEncodingMenu() = docShell->GetMayEnableCharacterEncodingMenu(); + locationChangeData->charsetAutodetected() = + docShell->GetCharsetAutodetected(); locationChangeData->contentPrincipal() = document->NodePrincipal(); locationChangeData->contentPartitionedPrincipal() = diff --git a/dom/ipc/BrowserParent.cpp b/dom/ipc/BrowserParent.cpp index 4cf1bf35ac89..a1e7b20c1eaf 100644 
--- a/dom/ipc/BrowserParent.cpp +++ b/dom/ipc/BrowserParent.cpp @@ -2713,6 +2713,8 @@ mozilla::ipc::IPCResult BrowserParent::RecvOnStateChange( Unused << browser->SetIsNavigating(aStateChangeData->isNavigating()); Unused << browser->SetMayEnableCharacterEncodingMenu( aStateChangeData->mayEnableCharacterEncodingMenu()); + Unused << browser->SetCharsetAutodetected( + aStateChangeData->charsetAutodetected()); Unused << browser->UpdateForStateChange(aStateChangeData->charset(), aStateChangeData->documentURI(), aStateChangeData->contentType()); @@ -2781,6 +2783,7 @@ mozilla::ipc::IPCResult BrowserParent::RecvOnLocationChange( Unused << browser->UpdateForLocationChange( aLocation, aLocationChangeData->charset(), aLocationChangeData->mayEnableCharacterEncodingMenu(), + aLocationChangeData->charsetAutodetected(), aLocationChangeData->documentURI(), aLocationChangeData->title(), aLocationChangeData->contentPrincipal(), aLocationChangeData->contentPartitionedPrincipal(), diff --git a/dom/ipc/PBrowser.ipdl b/dom/ipc/PBrowser.ipdl index bf1c3752320d..718328e948c9 100644 --- a/dom/ipc/PBrowser.ipdl +++ b/dom/ipc/PBrowser.ipdl @@ -130,6 +130,7 @@ struct WebProgressStateChangeData { bool isNavigating; bool mayEnableCharacterEncodingMenu; + bool charsetAutodetected; // The following fields are only set when the aStateFlags param passed with // this struct is |nsIWebProgress.STATE_STOP|. @@ -143,6 +144,7 @@ struct WebProgressLocationChangeData bool isNavigating; bool isSyntheticDocument; bool mayEnableCharacterEncodingMenu; + bool charsetAutodetected; nsString contentType; nsString title; nsString charset; diff --git a/intl/JapaneseDetector.h b/intl/JapaneseDetector.h new file mode 100644 index 000000000000..4407016facc8 --- /dev/null +++ b/intl/JapaneseDetector.h @@ -0,0 +1,124 @@ +// Copyright 2018 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. 
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// Mostly copied and pasted from +// third_party/rust/shift_or_euc/src/lib.rs , so +// "top-level directory of this distribution" above refers to +// third_party/rust/shift_or_euc/ + +#ifndef mozilla_JapaneseDetector_h +#define mozilla_JapaneseDetector_h + +#include "mozilla/Encoding.h" + +namespace mozilla { +class JapaneseDetector; +}; // namespace mozilla + +#define SHIFT_OR_EUC_DETECTOR mozilla::JapaneseDetector + +#include "shift_or_euc.h" + +namespace mozilla { + +/** + * A Japanese legacy encoding detector for detecting between Shift_JIS, + * EUC-JP, and, optionally, ISO-2022-JP _given_ the assumption that the + * encoding is one of those. + * + * # Principle of Operation + * + * The detector is based on two observations: + * + * 1. The ISO-2022-JP escape sequences don't normally occur in Shift_JIS or + * EUC-JP, so encountering such an escape sequence (before non-ASCII has been + * encountered) can be taken as indication of ISO-2022-JP. + * 2. When normal (full-width) kana or common kanji encoded as Shift_JIS is + * decoded as EUC-JP, or vice versa, the result is either an error or + * half-width katakana, and it's very uncommon for Japanese HTML to have + * half-width katakana character before a normal kana or common kanji + * character. Therefore, if decoding as Shift_JIS results in error or + * half-width katakana, the detector decides that the content is EUC-JP, and + * vice versa. + * + * # Failure Modes + * + * The detector gives the wrong answer if the text has a half-width katakana + * character before normal kana or common kanji. Some uncommon kanji are + * undecidable. (All JIS X 0208 Level 1 kanji are decidable.) 
+ * + * The half-width katakana issue is mainly relevant for old 8-bit JIS X + * 0201-only text files that would decode correctly as Shift_JIS but that the + * detector detects as EUC-JP. + * + * The undecidable kanji issue does not realistically show up when a full + * document is fed to the detector, because, realistically, in a full + * document, there is at least one kana or common kanji. It can occur, + * though, if the detector is only run on a prefix of a document and the + * prefix only contains the title of the document. It is possible for + * document title to consist entirely of undecidable kanji. (Indeed, + * Japanese Wikipedia has articles with such titles.) If the detector is + * undecided, a fallback to Shift_JIS should be used. + */ +class JapaneseDetector final { + public: + ~JapaneseDetector() {} + + static void operator delete(void* aDetector) { + shift_or_euc_detector_free(reinterpret_cast(aDetector)); + } + + /** + * Instantiates the detector. If `aAllow2022` is `true` the possible + * guesses are Shift_JIS, EUC-JP, ISO-2022-JP, and undecided. If + * `aAllow2022` is `false`, the possible guesses are Shift_JIS, EUC-JP, + * and undecided. + */ + static inline UniquePtr Create(bool aAllow2022) { + UniquePtr detector(shift_or_euc_detector_new(aAllow2022)); + return detector; + } + + /** + * Feeds bytes to the detector. If `aLast` is `true` the end of the stream + * is considered to occur immediately after the end of `aBuffer`. + * Otherwise, the stream is expected to continue. `aBuffer` may be empty. + * + * If you're running the detector only on a prefix of a complete + * document, _do not_ pass `aLast` as `true` after the prefix if the + * stream as a whole still contains more content. + * + * Returns `SHIFT_JIS_ENCODING` if the detector guessed + * Shift_JIS. Returns `EUC_JP_ENCODING` if the detector + * guessed EUC-JP. 
Returns `ISO_2022_JP_ENCODING` if the + * detector guessed ISO-2022-JP (only possible if `true` was passed as + * `aAllow2022` when instantiating the detector). Returns `nullptr` if the + * detector is undecided. If `nullptr` is returned even when passing `true` + * as `aLast`, falling back to Shift_JIS is the best guess for Web + * purposes. + * + * Do not call again after the method has returned non-`nullptr` or after + * the method has been called with `true` as `aLast`. (Asserts if the + * previous sentence isn't adhered to.) + */ + inline const mozilla::Encoding* Feed(Span aBuffer, + bool aLast) { + return shift_or_euc_detector_feed(this, aBuffer.Elements(), + aBuffer.Length(), aLast); + } + + private: + JapaneseDetector() = delete; + JapaneseDetector(const JapaneseDetector&) = delete; + JapaneseDetector& operator=(const JapaneseDetector&) = delete; +}; + +}; // namespace mozilla + +#endif // mozilla_JapaneseDetector_h diff --git a/intl/moz.build b/intl/moz.build index a180d8d418c3..0df2c86342e2 100644 --- a/intl/moz.build +++ b/intl/moz.build @@ -27,12 +27,14 @@ DIRS += [ EXPORTS.mozilla += [ "Encoding.h", "EncodingDetector.h", + "JapaneseDetector.h", ] EXPORTS += [ "../third_party/rust/chardetng_c/include/chardetng.h", "../third_party/rust/encoding_c/include/encoding_rs.h", "../third_party/rust/encoding_c/include/encoding_rs_statics.h", + "../third_party/rust/shift_or_euc_c/include/shift_or_euc.h", ] diff --git a/mobile/android/locales/l10n.toml b/mobile/android/locales/l10n.toml index 0121e67a444a..c989e006256e 100644 --- a/mobile/android/locales/l10n.toml +++ b/mobile/android/locales/l10n.toml @@ -177,6 +177,10 @@ exclude-multi-locale = [ reference = "toolkit/locales/en-US/toolkit/about/*Compat.ftl" l10n = "{l}toolkit/toolkit/about/*Compat.ftl" +[[paths]] + reference = "toolkit/locales/en-US/chrome/global/charsetMenu.properties" + l10n = "{l}toolkit/chrome/global/charsetMenu.properties" + [[paths]] reference = 
"toolkit/locales/en-US/chrome/global/commonDialogs.properties" l10n = "{l}toolkit/chrome/global/commonDialogs.properties" diff --git a/modules/libpref/init/StaticPrefList.yaml b/modules/libpref/init/StaticPrefList.yaml index 5c8418dd22b2..22e252315b26 100644 --- a/modules/libpref/init/StaticPrefList.yaml +++ b/modules/libpref/init/StaticPrefList.yaml @@ -5581,6 +5581,24 @@ # Prefs starting with "intl." #--------------------------------------------------------------------------- +# Whether the new encoding detector is enabled for the .jp TLD. +- name: intl.charset.detector.ng.jp.enabled + type: bool + value: true + mirror: always + +# Whether the new encoding detector is enabled for the .in TLD. +- name: intl.charset.detector.ng.in.enabled + type: bool + value: true + mirror: always + +# Whether the new encoding detector is enabled for the .lk TLD. +- name: intl.charset.detector.ng.lk.enabled + type: bool + value: true + mirror: always + # If true, dispatch the keydown and keyup events on any web apps even during # composition. 
- name: intl.ime.hack.on_any_apps.fire_key_events_for_composition diff --git a/parser/html/nsHtml5StreamParser.cpp b/parser/html/nsHtml5StreamParser.cpp index 8dbc2397e88d..b583608da7f4 100644 --- a/parser/html/nsHtml5StreamParser.cpp +++ b/parser/html/nsHtml5StreamParser.cpp @@ -20,6 +20,7 @@ #include "mozilla/DebugOnly.h" #include "mozilla/Encoding.h" #include "mozilla/EncodingDetector.h" +#include "mozilla/JapaneseDetector.h" #include "mozilla/Likely.h" #include "mozilla/Maybe.h" #include "mozilla/SchedulerGroup.h" @@ -306,6 +307,9 @@ int32_t nsHtml5StreamParser::MaybeRollBackSource(int32_t aSource) { } void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) { + if (mJapaneseDetector) { + return; + } if (aInitial) { if (!mDetectorHasSeenNonAscii) { mDetectorHadOnlySeenAsciiWhenFirstGuessing = true; @@ -316,6 +320,7 @@ void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) { bool forced = (mCharsetSource == kCharsetFromPendingUserForcedAutoDetection || mCharsetSource == kCharsetFromInitialUserForcedAutoDetection); MOZ_ASSERT( + mCharsetSource != kCharsetFromFinalJapaneseAutoDetection && mCharsetSource != kCharsetFromFinalUserForcedAutoDetection && mCharsetSource != kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 && mCharsetSource != @@ -380,7 +385,8 @@ void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) { mCharsetSource = MaybeRollBackSource(source); mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource); } else { - MOZ_ASSERT(mCharsetSource < kCharsetFromXmlDeclarationUtf16 || forced); + MOZ_ASSERT(mCharsetSource < kCharsetFromFinalJapaneseAutoDetection || + forced); // We've already committed to a decoder. Request a reload from the // docshell. 
mTreeBuilder->NeedsCharsetSwitchTo(encoding, source, 0); @@ -401,9 +407,44 @@ void nsHtml5StreamParser::GuessEncoding(bool aEof, bool aInitial) { } } +void nsHtml5StreamParser::FeedJapaneseDetector(Span aBuffer, + bool aLast) { + MOZ_ASSERT(!mDecodingLocalFileWithoutTokenizing); + const Encoding* detected = mJapaneseDetector->Feed(aBuffer, aLast); + if (!detected) { + return; + } + DontGuessEncoding(); + int32_t source = kCharsetFromFinalJapaneseAutoDetection; + if (mCharsetSource == kCharsetFromUserForced) { + source = kCharsetFromUserForcedJapaneseAutoDetection; + } + if (detected == mEncoding) { + MOZ_ASSERT(mCharsetSource < source, "Why are we running chardet at all?"); + mCharsetSource = source; + mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource); + } else if (HasDecoder()) { + // We've already committed to a decoder. Request a reload from the + // docshell. + mTreeBuilder->NeedsCharsetSwitchTo(WrapNotNull(detected), source, 0); + FlushTreeOpsAndDisarmTimer(); + Interrupt(); + } else { + // Got a confident answer from the sniffing buffer. That code will + // take care of setting up the decoder. 
+ mEncoding = WrapNotNull(detected); + mCharsetSource = source; + mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource); + } +} + void nsHtml5StreamParser::FeedDetector(Span aBuffer, bool aLast) { - mDetectorHasSeenNonAscii = mDetector->Feed(aBuffer, aLast); + if (mJapaneseDetector) { + FeedJapaneseDetector(aBuffer, aLast); + } else { + mDetectorHasSeenNonAscii = mDetector->Feed(aBuffer, aLast); + } } void nsHtml5StreamParser::SetViewSourceTitle(nsIURI* aURL) { @@ -450,12 +491,13 @@ nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment( NS_ASSERTION(IsParserThread(), "Wrong thread!"); nsresult rv = NS_OK; if (mDecodingLocalFileWithoutTokenizing && - mCharsetSource <= kCharsetFromFallback) { + mCharsetSource <= kCharsetFromTopLevelDomain) { MOZ_ASSERT(mEncoding != UTF_8_ENCODING); mUnicodeDecoder = UTF_8_ENCODING->NewDecoderWithBOMRemoval(); } else { - if (mCharsetSource >= kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8) { - if (!(mCharsetSource == kCharsetFromPendingUserForcedAutoDetection || + if (mCharsetSource >= kCharsetFromFinalJapaneseAutoDetection) { + if (!(mCharsetSource == kCharsetFromUserForced || + mCharsetSource == kCharsetFromPendingUserForcedAutoDetection || mCharsetSource == kCharsetFromInitialUserForcedAutoDetection)) { DontGuessEncoding(); } @@ -639,7 +681,7 @@ nsresult nsHtml5StreamParser::FinalizeSniffing(Span aFromSegment, uint32_t aCountToSniffingLimit, bool aEof) { MOZ_ASSERT(IsParserThread(), "Wrong thread!"); - MOZ_ASSERT(mCharsetSource < kCharsetFromXmlDeclarationUtf16, + MOZ_ASSERT(mCharsetSource < kCharsetFromUserForcedJapaneseAutoDetection, "Should not finalize sniffing with strong decision already made."); if (mMode == VIEW_SOURCE_XML) { static const XML_Memory_Handling_Suite memsuite = { @@ -702,9 +744,12 @@ nsresult nsHtml5StreamParser::FinalizeSniffing(Span aFromSegment, return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment); } - bool forced = (mCharsetSource == 
kCharsetFromPendingUserForcedAutoDetection || - mCharsetSource == kCharsetFromInitialUserForcedAutoDetection || - mCharsetSource == kCharsetFromFinalUserForcedAutoDetection); + bool forced = + (mCharsetSource == kCharsetFromUserForced || + mCharsetSource == kCharsetFromUserForcedJapaneseAutoDetection || + mCharsetSource == kCharsetFromPendingUserForcedAutoDetection || + mCharsetSource == kCharsetFromInitialUserForcedAutoDetection || + mCharsetSource == kCharsetFromFinalUserForcedAutoDetection); if (!mChannelHadCharset && (forced || mCharsetSource < kCharsetFromMetaPrescan) && (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA)) { @@ -731,7 +776,17 @@ nsresult nsHtml5StreamParser::FinalizeSniffing(Span aFromSegment, if (forced && (encoding->IsAsciiCompatible() || encoding == ISO_2022_JP_ENCODING)) { // Honor override - if (mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) { + if (mCharsetSource == kCharsetFromUserForced && + mEncoding->IsJapaneseLegacy()) { + mFeedChardet = true; + if (!mJapaneseDetector) { + mJapaneseDetector = mozilla::JapaneseDetector::Create(true); + } + FinalizeSniffingWithDetector(aFromSegment, aCountToSniffingLimit, + false); + } else if (mCharsetSource == + kCharsetFromUserForcedJapaneseAutoDetection || + mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) { DontGuessEncoding(); } else { FinalizeSniffingWithDetector(aFromSegment, aCountToSniffingLimit, @@ -753,7 +808,15 @@ nsresult nsHtml5StreamParser::FinalizeSniffing(Span aFromSegment, } if (forced && mCharsetSource != kCharsetFromIrreversibleAutoDetection) { // neither meta nor XML declaration found, honor override - if (mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) { + if (mCharsetSource == kCharsetFromUserForced && + mEncoding->IsJapaneseLegacy()) { + mFeedChardet = true; + if (!mJapaneseDetector) { + mJapaneseDetector = mozilla::JapaneseDetector::Create(true); + } + FinalizeSniffingWithDetector(aFromSegment, aCountToSniffingLimit, 
false); + } else if (mCharsetSource == kCharsetFromUserForcedJapaneseAutoDetection || + mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) { DontGuessEncoding(); } else { FinalizeSniffingWithDetector(aFromSegment, aCountToSniffingLimit, false); @@ -958,7 +1021,9 @@ nsresult nsHtml5StreamParser::SniffStreamBytes( // this is the last buffer uint32_t countToSniffingLimit = SNIFFING_BUFFER_SIZE - mSniffingLength; bool forced = - (mCharsetSource == kCharsetFromPendingUserForcedAutoDetection || + (mCharsetSource == kCharsetFromUserForced || + mCharsetSource == kCharsetFromUserForcedJapaneseAutoDetection || + mCharsetSource == kCharsetFromPendingUserForcedAutoDetection || mCharsetSource == kCharsetFromInitialUserForcedAutoDetection || mCharsetSource == kCharsetFromFinalUserForcedAutoDetection); if (!mChannelHadCharset && (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || @@ -980,7 +1045,18 @@ nsresult nsHtml5StreamParser::SniffStreamBytes( if (forced && (encoding->IsAsciiCompatible() || encoding == ISO_2022_JP_ENCODING)) { // Honor override - if (mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) { + if (mCharsetSource == kCharsetFromUserForced && + mEncoding->IsJapaneseLegacy()) { + mFeedChardet = true; + if (!mJapaneseDetector) { + mJapaneseDetector = mozilla::JapaneseDetector::Create(true); + } + FinalizeSniffingWithDetector(aFromSegment, countToSniffingLimit, + false); + } else if (mCharsetSource == + kCharsetFromUserForcedJapaneseAutoDetection || + mCharsetSource == + kCharsetFromFinalUserForcedAutoDetection) { DontGuessEncoding(); } else { FinalizeSniffingWithDetector(aFromSegment, countToSniffingLimit, @@ -1015,7 +1091,9 @@ nsresult nsHtml5StreamParser::SniffStreamBytes( } if (encoding) { // meta scan successful; honor overrides unless meta is XSS-dangerous - if ((mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) && + if ((mCharsetSource == kCharsetFromUserForced || + mCharsetSource == kCharsetFromUserForcedJapaneseAutoDetection || + 
mCharsetSource == kCharsetFromFinalUserForcedAutoDetection) && (encoding->IsAsciiCompatible() || encoding == ISO_2022_JP_ENCODING)) { // Honor override return SetupDecodingAndWriteSniffingBufferAndCurrentSegment( @@ -1215,7 +1293,7 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) { // let's instantiate only if we make it out of this method with the // intent to use it. auto detectorCreator = MakeScopeExit([&] { - if (mFeedChardet) { + if (mFeedChardet && !mJapaneseDetector) { mDetector = mozilla::EncodingDetector::Create(); } }); @@ -1243,7 +1321,7 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) { nsresult rv = GetChannel(getter_AddRefs(channel)); if (NS_SUCCEEDED(rv)) { isSrcdoc = NS_IsSrcdocChannel(channel); - if (!isSrcdoc && mCharsetSource <= kCharsetFromFallback) { + if (!isSrcdoc && mCharsetSource <= kCharsetFromTopLevelDomain) { nsCOMPtr originalURI; rv = channel->GetOriginalURI(getter_AddRefs(originalURI)); if (NS_SUCCEEDED(rv)) { @@ -1379,8 +1457,44 @@ nsresult nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest) { if (!(mCharsetSource == kCharsetFromPendingUserForcedAutoDetection || mCharsetSource == kCharsetFromInitialUserForcedAutoDetection || mCharsetSource == kCharsetFromFinalUserForcedAutoDetection)) { - if (mCharsetSource >= kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8) { - DontGuessEncoding(); + if (mCharsetSource >= kCharsetFromFinalJapaneseAutoDetection) { + if ((mCharsetSource == kCharsetFromUserForced) && + mEncoding->IsJapaneseLegacy()) { + // Japanese detector only + if (!mJapaneseDetector) { + mJapaneseDetector = mozilla::JapaneseDetector::Create(true); + } + mGuessEncoding = false; + } else { + DontGuessEncoding(); + } + } + + // Compute various pref-based special cases + if (!mDecodingLocalFileWithoutTokenizing && mFeedChardet) { + if (mTLD.EqualsLiteral("jp")) { + if (!mJapaneseDetector && + !StaticPrefs::intl_charset_detector_ng_jp_enabled()) { + mJapaneseDetector = 
mozilla::JapaneseDetector::Create(true); + } + if (mJapaneseDetector && mEncoding == WINDOWS_1252_ENCODING && + mCharsetSource <= kCharsetFromTopLevelDomain) { + mCharsetSource = kCharsetFromTopLevelDomain; + mEncoding = SHIFT_JIS_ENCODING; + mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource); + } + } else if ((mTLD.EqualsLiteral("in") && + !StaticPrefs::intl_charset_detector_ng_in_enabled()) || + (mTLD.EqualsLiteral("lk") && + !StaticPrefs::intl_charset_detector_ng_lk_enabled())) { + if (mEncoding == WINDOWS_1252_ENCODING && + mCharsetSource <= kCharsetFromTopLevelDomain) { + // Avoid breaking font hacks that Chrome doesn't break. + mCharsetSource = kCharsetFromTopLevelDomain; + mTreeBuilder->SetDocumentCharset(mEncoding, mCharsetSource); + } + DontGuessEncoding(); + } } } diff --git a/parser/html/nsHtml5StreamParser.h b/parser/html/nsHtml5StreamParser.h index 4d59dc238248..b62e15045eb6 100644 --- a/parser/html/nsHtml5StreamParser.h +++ b/parser/html/nsHtml5StreamParser.h @@ -45,6 +45,7 @@ class nsIURI; namespace mozilla { class EncodingDetector; +class JapaneseDetector; template class Buffer; @@ -218,6 +219,11 @@ class nsHtml5StreamParser final : public nsISupports { // Not from an external interface + /** + * Pass a buffer to the JapaneseDetector. + */ + void FeedJapaneseDetector(mozilla::Span aBuffer, bool aLast); + /** * Pass a buffer to the Japanese or Cyrillic detector as appropriate. */ @@ -652,6 +658,11 @@ class nsHtml5StreamParser final : public nsISupports { nsCOMPtr mLoadFlusher; + /** + * The Japanese detector. + */ + mozilla::UniquePtr mJapaneseDetector; + /** * The generict detector. 
*/ diff --git a/parser/nsCharsetSource.h b/parser/nsCharsetSource.h index 0d1f5e2da5eb..f5128e1cff73 100644 --- a/parser/nsCharsetSource.h +++ b/parser/nsCharsetSource.h @@ -9,6 +9,7 @@ enum { kCharsetUninitialized, kCharsetFromFallback, + kCharsetFromTopLevelDomain, kCharsetFromDocTypeDefault, // This and up confident for XHR // Start subdividing source for telementry purposes kCharsetFromInitialAutoDetectionASCII, @@ -16,6 +17,7 @@ enum { kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic, kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content, kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD, + kCharsetFromFinalJapaneseAutoDetection, // Deliberately no Final version of ASCII kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8, kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic, @@ -35,6 +37,8 @@ enum { // later kCharsetFromInitialUserForcedAutoDetection, kCharsetFromFinalUserForcedAutoDetection, + kCharsetFromUserForced, // propagates to child frames + kCharsetFromUserForcedJapaneseAutoDetection, kCharsetFromXmlDeclarationUtf16, // This one is overridden by // kCharsetFromChannel kCharsetFromIrreversibleAutoDetection, // This one is overridden by diff --git a/third_party/rust/shift_or_euc/.cargo-checksum.json b/third_party/rust/shift_or_euc/.cargo-checksum.json new file mode 100644 index 000000000000..cc5816349a54 --- /dev/null +++ b/third_party/rust/shift_or_euc/.cargo-checksum.json @@ -0,0 +1 @@ 
+{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"3a7313aa2f19bf7095a2fd731c3d5e76f38d5e4640bd2a115d53032f24b2aa6c","Cargo.toml":"f9f41b76ecbe257a312ab09ed1208189b8dc9952d12d17a216fe2846d1d471c8","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"dac4dde23582d18b01701032860d8f8a1979fb2cf626060ca8de77e081a2a3d5","README.md":"b7148745a7ef59788e76fbe638d4b41c54dcaa1313a809f4630a020645f892a8","examples/detect.rs":"eb7239ccc802290ef24331db600ca1226198801dd86df86876b4b738ef4b8470","src/lib.rs":"f2a83db125d553af5c6fabae0487ef211aad62f2d93c4418dc510cbd425d472a"},"package":"f930dea4685b9803954b9d74cdc175c6d946a22f2eafe5aa2e9a58cdcae7da8c"} \ No newline at end of file diff --git a/third_party/rust/shift_or_euc/CONTRIBUTING.md b/third_party/rust/shift_or_euc/CONTRIBUTING.md new file mode 100644 index 000000000000..1d41d4c60ecc --- /dev/null +++ b/third_party/rust/shift_or_euc/CONTRIBUTING.md @@ -0,0 +1,38 @@ +If you send a pull request / patch, please observe the following. + +## Licensing + +Since this crate is dual-licensed, +[section 5 of the Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0#contributions) +is considered to apply in the sense of Contributions being automatically +under the Apache License 2.0 or MIT dual license (see the `COPYRIGHT` file). +That is, by the act of offering a Contribution, you place your Contribution +under the Apache License 2.0 or MIT dual license stated in the `COPYRIGHT` +file. Please do not contribute if you aren't willing or allowed to license your +contributions in this manner. + +You are encouraged to dedicate test code that you contribute to the Public +Domain using the CC0 dedication. If you contribute test code that is not +dedicated to the Public Domain, please be sure not to put it in a part of +source code that the comments designate as being dedicated to the Public +Domain. 
+ +## Copyright Notices + +If you require the addition of your copyright notice, it's up to you to edit in +your notice as part of your Contribution. Not adding a copyright notice is +taken as a waiver of copyright notice. + +## Compatibility with Stable Rust + +Please ensure that your Contribution compiles with the latest stable-channel +rustc. + +## rustfmt + +The `rustfmt` version used for this code is `rustfmt-nightly`. Please either +use that version or avoid using `rustfmt` (so as not to reformat all the code). + +## Unit tests + +Please ensure that `cargo test` succeeds. diff --git a/third_party/rust/shift_or_euc/COPYRIGHT b/third_party/rust/shift_or_euc/COPYRIGHT new file mode 100644 index 000000000000..1cacb3eb05ef --- /dev/null +++ b/third_party/rust/shift_or_euc/COPYRIGHT @@ -0,0 +1,9 @@ +shift_or_euc is copyright 2018 Mozilla Foundation. + +Licensed under the Apache License, Version 2.0 + or the MIT +license , +at your option. All files in the project carrying such +notice may not be copied, modified, or distributed except +according to those terms. diff --git a/third_party/rust/shift_or_euc/Cargo.toml b/third_party/rust/shift_or_euc/Cargo.toml new file mode 100644 index 000000000000..b28ae619a225 --- /dev/null +++ b/third_party/rust/shift_or_euc/Cargo.toml @@ -0,0 +1,30 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. 
If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +edition = "2018" +name = "shift_or_euc" +version = "0.1.0" +authors = ["Henri Sivonen "] +description = "Detects among the Japanese legacy encodings" +homepage = "https://docs.rs/shift_or_euc/" +documentation = "https://docs.rs/shift_or_euc/" +readme = "README.md" +keywords = ["encoding", "web", "charset"] +categories = ["text-processing", "encoding", "web-programming", "internationalization"] +license = "MIT/Apache-2.0" +repository = "https://github.com/hsivonen/shift_or_euc" +[dependencies.encoding_rs] +version = "0.8.17" + +[dependencies.memchr] +version = "2.2.0" diff --git a/third_party/rust/shift_or_euc/LICENSE-APACHE b/third_party/rust/shift_or_euc/LICENSE-APACHE new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/third_party/rust/shift_or_euc/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/third_party/rust/shift_or_euc/LICENSE-MIT b/third_party/rust/shift_or_euc/LICENSE-MIT new file mode 100644 index 000000000000..9ac617754c14 --- /dev/null +++ b/third_party/rust/shift_or_euc/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2018 Mozilla Foundation + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/third_party/rust/shift_or_euc/README.md b/third_party/rust/shift_or_euc/README.md new file mode 100644 index 000000000000..e0c77d6fcc52 --- /dev/null +++ b/third_party/rust/shift_or_euc/README.md @@ -0,0 +1,73 @@ +# shift_or_euc + +[![Apache 2 / MIT dual-licensed](https://img.shields.io/badge/license-Apache%202%20%2F%20MIT-blue.svg)](https://github.com/hsivonen/shift_or_euc/blob/master/COPYRIGHT) + +A Japanese legacy encoding detector for detecting between Shift_JIS, EUC-JP, +and, optionally, ISO-2022-JP _given_ the assumption that the encoding is one +of those. + +This detector is generally more accurate (but see below about the failure +mode on half-width katakana) and decides much sooner than machine +learning-based detectors. To decide EUC-JP, machine learning-based detectors +try to gain confidence that the input looks like EUC-JP. To decide EUC-JP, +this detector instead looks for two simple rule-based signs of the input not +being Shift_JIS. + +As a consequence of not containing machine learning tables, the binary size +footprint that this crate adds on top of +[`encoding_rs`](https://docs.rs/crate/encoding_rs) is tiny. + +## Documentation + +[API documentation on docs.rs](https://docs.rs/crate/shift_or_euc) + +## Licensing + +See the file named [COPYRIGHT](https://github.com/hsivonen/shift_or_euc/blob/master/COPYRIGHT). + +## Sample Program Usage + +1. [Install Rust](https://rustup.rs/) +2. `git clone https://github.com/hsivonen/shift_or_euc` +3. `cd shift_or_euc` +4. `cargo run --example detect PATH_TO_FILE` + +The program prints one of: + +* Shift_JIS +* EUC-JP +* ISO-2022-JP +* Undecided + +## Principle of Operation + +The detector is based on two observations: + +1. The ISO-2022-JP escape sequences don't normally occur in Shift_JIS or +EUC-JP, so encountering such an escape sequence (before non-ASCII has been +encountered) can be taken as indication of ISO-2022-JP. +2. 
When normal (full-with) kana or common kanji encoded as Shift_JIS is +decoded as EUC-JP, or vice versa, the result is either an error or half-width +katakana, and it's very uncommon for Japanese HTML to have half-width katakana +character before a normal kana or common kanji character. Therefore, if +decoding as Shift_JIS results in error or have-width katakana, the detector +decides that the content is EUC-JP, and vice versa. + +## Failure Modes + +The detector gives the wrong answer if the text has a half-width katakana +character before normal kana or common kanji. Some uncommon kanji are +undecidable. (All JIS X 0208 Level 1 kanji are decidable.) + +The half-width katakana issue is mainly relevant for old 8-bit JIS X 0201-only +text files that would decode correctly as Shift_JIS but that the detector +detects as EUC-JP. + +The undecidable kanji issue does not realistically show up when a full +document is fed to the detector, because, realistically, in a full document, +there is at least one kana or common kanji. It can occur, though, if the +detector is only run on a prefix of a document and the prefix only contains +the title of the document. It is possible for document title to consist +entirely of undecidable kanji. (Indeed, Japanese Wikipedia has articles with +such titles.) If the detector is undecided, falling back to Shift_JIS is +typically the Web oriented better guess. \ No newline at end of file diff --git a/third_party/rust/shift_or_euc/examples/detect.rs b/third_party/rust/shift_or_euc/examples/detect.rs new file mode 100644 index 000000000000..9ab21a3561b4 --- /dev/null +++ b/third_party/rust/shift_or_euc/examples/detect.rs @@ -0,0 +1,56 @@ +// Copyright 2018 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use std::fs::File; +use std::io::Read; + +use shift_or_euc::Detector; + +fn main() { + let mut args = std::env::args_os(); + if args.next().is_none() { + eprintln!("Error: Program name missing from arguments."); + std::process::exit(-1); + } + if let Some(path) = args.next() { + if args.next().is_some() { + eprintln!("Error: Too many arguments."); + std::process::exit(-3); + } + if let Ok(mut file) = File::open(path) { + let mut buffer = [0u8; 4096]; + let mut detector = Detector::new(true); + loop { + if let Ok(num_read) = file.read(&mut buffer[..]) { + let opt_enc = if num_read == 0 { + detector.feed(b"", true) + } else { + detector.feed(&buffer[..num_read], false) + }; + if let Some(encoding) = opt_enc { + println!("{}", encoding.name()); + return; + } else if num_read == 0 { + println!("Undecided"); + return; + } + } else { + eprintln!("Error: Error reading file."); + std::process::exit(-5); + } + } + } else { + eprintln!("Error: Could not open file."); + std::process::exit(-4); + } + } else { + eprintln!("Error: One path argument needed."); + std::process::exit(-2); + } +} diff --git a/third_party/rust/shift_or_euc/src/lib.rs b/third_party/rust/shift_or_euc/src/lib.rs new file mode 100644 index 000000000000..978fc7f27e0f --- /dev/null +++ b/third_party/rust/shift_or_euc/src/lib.rs @@ -0,0 +1,278 @@ +// Copyright 2018 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![doc(html_root_url = "https://docs.rs/shift_or_euc/0.1.0")] + +//! A Japanese legacy encoding detector for detecting between Shift_JIS, +//! EUC-JP, and, optionally, ISO-2022-JP _given_ the assumption that the +//! encoding is one of those. +//! +//! This detector is generally more accurate (but see below about the failure +//! 
mode on half-width katakana) and decides much sooner than machine +//! learning-based detectors. To decide EUC-JP, machine learning-based +//! detectors try to gain confidence that the input looks like EUC-JP. To +//! decide EUC-JP, this detector instead looks for two simple rule-based +//! signs of the input not being Shift_JIS. +//! +//! As a consequence of not containing machine learning tables, the binary +//! size footprint that this crate adds on top of +//! [`encoding_rs`](https://docs.rs/crate/encoding_rs) is tiny. +//! +//! # Licensing +//! +//! See the file named [COPYRIGHT](https://github.com/hsivonen/shift_or_euc/blob/master/COPYRIGHT). +//! +//! # Principle of Operation +//! +//! The detector is based on two observations: +//! +//! 1. The ISO-2022-JP escape sequences don't normally occur in Shift_JIS or +//! EUC-JP, so encountering such an escape sequence (before non-ASCII has been +//! encountered) can be taken as indication of ISO-2022-JP. +//! 2. When normal (full-with) kana or common kanji encoded as Shift_JIS is +//! decoded as EUC-JP, or vice versa, the result is either an error or +//! half-width katakana, and it's very uncommon for Japanese HTML to have +//! half-width katakana character before a normal kana or common kanji +//! character. Therefore, if decoding as Shift_JIS results in error or +//! have-width katakana, the detector decides that the content is EUC-JP, and +//! vice versa. +//! +//! # Failure Modes +//! +//! The detector gives the wrong answer if the text has a half-width katakana +//! character before normal kana or common kanji. Some uncommon kanji are +//! undecidable. (All JIS X 0208 Level 1 kanji are decidable.) +//! +//! The half-width katakana issue is mainly relevant for old 8-bit JIS X +//! 0201-only text files that would decode correctly as Shift_JIS but that the +//! detector detects as EUC-JP. +//! +//! The undecidable kanji issue does not realistically show up when a full +//! 
document is fed to the detector, because, realistically, in a full +//! document, there is at least one kana or common kanji. It can occur, +//! though, if the detector is only run on a prefix of a document and the +//! prefix only contains the title of the document. It is possible for +//! document title to consist entirely of undecidable kanji. (Indeed, +//! Japanese Wikipedia has articles with such titles.) If the detector is +//! undecided, falling back to Shift_JIS is typically the Web oriented better +//! guess. + +use encoding_rs::Decoder; +use encoding_rs::DecoderResult; +use encoding_rs::Encoding; +use encoding_rs::EUC_JP; +use encoding_rs::ISO_2022_JP; +use encoding_rs::SHIFT_JIS; + +/// Returns the index of the first non-ASCII byte or the first +/// 0x1B, whichever comes first, or the length of the buffer +/// if neither is found. +fn find_non_ascii_or_escape(buffer: &[u8]) -> usize { + let ascii_up_to = Encoding::ascii_valid_up_to(buffer); + if let Some(escape) = memchr::memchr(0x1B, &buffer[..ascii_up_to]) { + escape + } else { + ascii_up_to + } +} + +/// Feed decoder with one byte (if `last` is `false`) or EOF (if `last` is +/// `true`). `byte` is ignored if `last` is `true`. +/// Returns `true` if there was no rejection or `false` upon rejecting the +/// encoding hypothesis represented by this decoder. 
+#[inline(always)] +fn feed_decoder(decoder: &mut Decoder, byte: u8, last: bool) -> bool { + let mut output = [0u16; 1]; + let input = [byte]; + let (result, _read, written) = decoder.decode_to_utf16_without_replacement( + if last { b"" } else { &input }, + &mut output, + last, + ); + match result { + DecoderResult::InputEmpty => { + if written == 1 { + match output[0] { + 0xFF61...0xFF9F => { + return false; + } + _ => {} + } + } + } + DecoderResult::Malformed(_, _) => { + return false; + } + DecoderResult::OutputFull => { + unreachable!(); + } + } + true +} + +/// A detector for detecting the character encoding of input on the +/// precondition that the encoding is a Japanese legacy encoding. +pub struct Detector { + shift_jis_decoder: Decoder, + euc_jp_decoder: Decoder, + second_byte_in_escape: u8, + iso_2022_jp_disqualified: bool, + escape_seen: bool, + finished: bool, +} + +impl Detector { + /// Instantiates the detector. If `allow_2022` is `true` the possible + /// guesses are Shift_JIS, EUC-JP, ISO-2022-JP, and undecided. If + /// `allow_2022` is `false`, the possible guesses are Shift_JIS, EUC-JP, + /// and undecided. + pub fn new(allow_2022: bool) -> Self { + Detector { + shift_jis_decoder: SHIFT_JIS.new_decoder_without_bom_handling(), + euc_jp_decoder: EUC_JP.new_decoder_without_bom_handling(), + second_byte_in_escape: 0, + iso_2022_jp_disqualified: !allow_2022, + escape_seen: false, + finished: false, + } + } + + /// Feeds bytes to the detector. If `last` is `true` the end of the stream + /// is considered to occur immediately after the end of `buffer`. + /// Otherwise, the stream is expected to continue. `buffer` may be empty. + /// + /// If you're running the detector only on a prefix of a complete + /// document, _do not_ pass `last` as `true` after the prefix if the + /// stream as a whole still contains more content. + /// + /// Returns `Some(encoding_rs::SHIFT_JIS)` if the detector guessed + /// Shift_JIS. 
Returns `Some(encoding_rs::EUC_JP)` if the detector + /// guessed EUC-JP. Returns `Some(encoding_rs::ISO_2022_JP)` if the + /// detector guessed ISO-2022-JP (only possible if `true` was passed as + /// `allow_2022` when instantiating the detector). Returns `None` if the + /// detector is undecided. If `None` is returned even when passing `true` + /// as `last`, falling back to Shift_JIS is the best guess for Web + /// purposes. + /// + /// Do not call again after the method has returned `Some(_)` or after + /// the method has been called with `true` as `last`. + /// + /// # Panics + /// + /// If called after the method has returned `Some(_)` or after the method + /// has been called with `true` as `last`. + pub fn feed(&mut self, buffer: &[u8], last: bool) -> Option<&'static Encoding> { + assert!( + !self.finished, + "Tried to used a detector that has finished." + ); + self.finished = true; // Will change back to false unless we return early + let mut i = 0; + if !self.iso_2022_jp_disqualified { + if !self.escape_seen { + i = find_non_ascii_or_escape(buffer); + } + while i < buffer.len() { + let byte = buffer[i]; + if byte > 0x7F { + self.iso_2022_jp_disqualified = true; + break; + } + if !self.escape_seen && byte == 0x1B { + self.escape_seen = true; + i += 1; + continue; + } + if self.escape_seen && self.second_byte_in_escape == 0 { + self.second_byte_in_escape = byte; + i += 1; + continue; + } + match (self.second_byte_in_escape, byte) { + (0x28, 0x42) | (0x28, 0x4A) | (0x28, 0x49) | (0x24, 0x40) | (0x24, 0x42) => { + return Some(ISO_2022_JP); + } + _ => {} + } + if self.escape_seen { + self.iso_2022_jp_disqualified = true; + break; + } + i += 1; + } + } + for &byte in &buffer[i..] 
{ + if !feed_decoder(&mut self.euc_jp_decoder, byte, false) { + return Some(SHIFT_JIS); + } + if !feed_decoder(&mut self.shift_jis_decoder, byte, false) { + return Some(EUC_JP); + } + } + if last { + if !feed_decoder(&mut self.euc_jp_decoder, 0, true) { + return Some(SHIFT_JIS); + } + if !feed_decoder(&mut self.shift_jis_decoder, 0, true) { + return Some(EUC_JP); + } + return None; + } + self.finished = false; + None + } +} + +// Any copyright to the test code below this comment is dedicated to the +// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_iso_2022_jp() { + let mut detector = Detector::new(true); + assert_eq!( + detector.feed(b"abc\x1B\x28\x42\xFF", true), + Some(ISO_2022_JP) + ); + } + + #[test] + fn test_error_precedence() { + let mut detector = Detector::new(true); + assert_eq!(detector.feed(b"abc\xFF", true), Some(SHIFT_JIS)); + } + + #[test] + fn test_invalid_euc_jp() { + let mut detector = Detector::new(true); + assert_eq!(detector.feed(b"abc\x81\x40", true), Some(SHIFT_JIS)); + } + + #[test] + fn test_invalid_shift_jis() { + let mut detector = Detector::new(true); + assert_eq!(detector.feed(b"abc\xEB\xA8", true), Some(EUC_JP)); + } + + #[test] + fn test_invalid_shift_jis_before_invalid_euc_jp() { + let mut detector = Detector::new(true); + assert_eq!(detector.feed(b"abc\xEB\xA8\x81\x40", true), Some(EUC_JP)); + } + + #[test] + fn test_undecided() { + let mut detector = Detector::new(true); + assert_eq!(detector.feed(b"abc", false), None); + assert_eq!(detector.feed(b"abc", false), None); + } + +} diff --git a/third_party/rust/shift_or_euc_c/.cargo-checksum.json b/third_party/rust/shift_or_euc_c/.cargo-checksum.json new file mode 100644 index 000000000000..1ba22660be26 --- /dev/null +++ b/third_party/rust/shift_or_euc_c/.cargo-checksum.json @@ -0,0 +1 @@ 
+{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"3a7313aa2f19bf7095a2fd731c3d5e76f38d5e4640bd2a115d53032f24b2aa6c","Cargo.toml":"342e5345f4fb433b89f397b07e4e7162376b30cbbc1d6f6ccb11523116e6ed6b","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"dac4dde23582d18b01701032860d8f8a1979fb2cf626060ca8de77e081a2a3d5","README.md":"a323f1f4537bc7b3f9b3b216c8ac5041b83aa0321f5349a52627aade947c6272","include/shift_or_euc.h":"47c3b9832cb7eb8995aa37dcc2e76be7d4f5c7b3fa6b43135e579831ab449cd8","src/lib.rs":"cab1898dd6724e0a0324a1e44f6348c107f13916da8873dba69c70dbc95ba9cd"},"package":"c81ec08c8a68c45c48d8ef58b80ce038cc9945891c4a4996761e2ec5cba05abc"} \ No newline at end of file diff --git a/third_party/rust/shift_or_euc_c/CONTRIBUTING.md b/third_party/rust/shift_or_euc_c/CONTRIBUTING.md new file mode 100644 index 000000000000..1d41d4c60ecc --- /dev/null +++ b/third_party/rust/shift_or_euc_c/CONTRIBUTING.md @@ -0,0 +1,38 @@ +If you send a pull request / patch, please observe the following. + +## Licensing + +Since this crate is dual-licensed, +[section 5 of the Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0#contributions) +is considered to apply in the sense of Contributions being automatically +under the Apache License 2.0 or MIT dual license (see the `COPYRIGHT` file). +That is, by the act of offering a Contribution, you place your Contribution +under the Apache License 2.0 or MIT dual license stated in the `COPYRIGHT` +file. Please do not contribute if you aren't willing or allowed to license your +contributions in this manner. + +You are encouraged to dedicate test code that you contribute to the Public +Domain using the CC0 dedication. If you contribute test code that is not +dedicated to the Public Domain, please be sure not to put it in a part of +source code that the comments designate as being dedicated to the Public +Domain. 
+ +## Copyright Notices + +If you require the addition of your copyright notice, it's up to you to edit in +your notice as part of your Contribution. Not adding a copyright notice is +taken as a waiver of copyright notice. + +## Compatibility with Stable Rust + +Please ensure that your Contribution compiles with the latest stable-channel +rustc. + +## rustfmt + +The `rustfmt` version used for this code is `rustfmt-nightly`. Please either +use that version or avoid using `rustfmt` (so as not to reformat all the code). + +## Unit tests + +Please ensure that `cargo test` succeeds. diff --git a/third_party/rust/shift_or_euc_c/COPYRIGHT b/third_party/rust/shift_or_euc_c/COPYRIGHT new file mode 100644 index 000000000000..1cacb3eb05ef --- /dev/null +++ b/third_party/rust/shift_or_euc_c/COPYRIGHT @@ -0,0 +1,9 @@ +shift_or_euc is copyright 2018 Mozilla Foundation. + +Licensed under the Apache License, Version 2.0 + or the MIT +license , +at your option. All files in the project carrying such +notice may not be copied, modified, or distributed except +according to those terms. diff --git a/third_party/rust/shift_or_euc_c/Cargo.toml b/third_party/rust/shift_or_euc_c/Cargo.toml new file mode 100644 index 000000000000..a7c91be4548e --- /dev/null +++ b/third_party/rust/shift_or_euc_c/Cargo.toml @@ -0,0 +1,30 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. 
If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +edition = "2018" +name = "shift_or_euc_c" +version = "0.1.0" +authors = ["Henri Sivonen "] +description = "C API for shift_or_euc" +homepage = "https://docs.rs/shift_or_euc_c/" +documentation = "https://docs.rs/shift_or_euc_c/" +readme = "README.md" +keywords = ["encoding", "web", "charset"] +categories = ["text-processing", "encoding", "web-programming", "internationalization"] +license = "MIT/Apache-2.0" +repository = "https://github.com/hsivonen/shift_or_euc_c" +[dependencies.encoding_rs] +version = "0.8.17" + +[dependencies.shift_or_euc] +version = "0.1.0" diff --git a/third_party/rust/shift_or_euc_c/LICENSE-APACHE b/third_party/rust/shift_or_euc_c/LICENSE-APACHE new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/third_party/rust/shift_or_euc_c/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/third_party/rust/shift_or_euc_c/LICENSE-MIT b/third_party/rust/shift_or_euc_c/LICENSE-MIT new file mode 100644 index 000000000000..9ac617754c14 --- /dev/null +++ b/third_party/rust/shift_or_euc_c/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2018 Mozilla Foundation + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/third_party/rust/shift_or_euc_c/README.md b/third_party/rust/shift_or_euc_c/README.md new file mode 100644 index 000000000000..a2e70b0f45ac --- /dev/null +++ b/third_party/rust/shift_or_euc_c/README.md @@ -0,0 +1,13 @@ +# shift_or_euc_c + +[![Apache 2 / MIT dual-licensed](https://img.shields.io/badge/license-Apache%202%20%2F%20MIT-blue.svg)](https://github.com/hsivonen/shift_or_euc_c/blob/master/COPYRIGHT) + +C API for [`shift_or_euc`](https://docs.rs/crate/shift_or_euc). + +## Documentation + +[API documentation on docs.rs](https://docs.rs/crate/shift_or_euc_c) + +## Licensing + +See the file named [COPYRIGHT](https://github.com/hsivonen/shift_or_euc_c/blob/master/COPYRIGHT). diff --git a/third_party/rust/shift_or_euc_c/include/shift_or_euc.h b/third_party/rust/shift_or_euc_c/include/shift_or_euc.h new file mode 100644 index 000000000000..3a5f4da6e518 --- /dev/null +++ b/third_party/rust/shift_or_euc_c/include/shift_or_euc.h @@ -0,0 +1,88 @@ +// Copyright 2018 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#ifndef shift_or_euc_h +#define shift_or_euc_h + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include "encoding_rs.h" + +#ifndef SHIFT_OR_EUC_DETECTOR +#define SHIFT_OR_EUC_DETECTOR Detector +#ifndef __cplusplus +typedef struct Detector_ Detector; +#endif +#endif + +/// Instantiates the detector. If `allow_2022` is `true` the possible +/// guesses are Shift_JIS, EUC-JP, ISO-2022-JP, and undecided. If +/// `allow_2022` is `false`, the possible guesses are Shift_JIS, EUC-JP, +/// and undecided. +/// +/// The instantiated detector must be freed after use using +/// `shift_or_euc_detector_free`. 
+SHIFT_OR_EUC_DETECTOR* shift_or_euc_detector_new(bool allow_2022); + +/// Deallocates a detector obtained from `shift_or_euc_detector_new`. +void shift_or_euc_detector_free(SHIFT_OR_EUC_DETECTOR* detector); + +/// Feeds bytes to the detector. If `last` is `true` the end of the stream +/// is considered to occur immediately after the end of `buffer`. +/// Otherwise, the stream is expected to continue. `buffer_len` may be zero. +/// `buffer` must not be `NULL` but may be undereferencable when +/// `buffer_len` is zero. +/// +/// If you're running the detector only on a prefix of a complete +/// document, _do not_ pass `last` as `true` after the prefix if the +/// stream as a whole still contains more content. +/// +/// Returns `SHIFT_JIS_ENCODING` if the detector guessed +/// Shift_JIS. Returns `EUC_JP_ENCODING` if the detector +/// guessed EUC-JP. Returns `ISO_2022_JP_ENCODING` if the +/// detector guessed ISO-2022-JP (only possible if `true` was passed as +/// `allow_2022` when instantiating the detector). Returns `NULL` if the +/// detector is undecided. If `NULL` is returned even when passing `true` +/// as `last`, falling back to Shift_JIS is the best guess for Web +/// purposes. +/// +/// Do not call again after the function has returned non-`NULL` or after +/// the function has been called with `true` as `last`. +/// +/// # Panics +/// +/// If called after the function has returned non-`NULL` or after the +/// function has been called with `true` as `last`. +/// +/// # Undefined Behavior +/// +/// UB ensues if +/// +/// * `detector` does not point to a detector obtained from +/// `shift_or_euc_detector_new` but not yet freed with +/// `shift_or_euc_detector_free`. +/// * `buffer` is `NULL`. +/// * `buffer` and `buffer_len` don't designate a range of memory +/// valid for reading. 
+ENCODING_RS_ENCODING const* shift_or_euc_detector_feed( + SHIFT_OR_EUC_DETECTOR* detector, + uint8_t const* buffer, + size_t buffer_len, + bool last +); + +#ifdef __cplusplus +} +#endif + +#endif // shift_or_euc_h \ No newline at end of file diff --git a/third_party/rust/shift_or_euc_c/src/lib.rs b/third_party/rust/shift_or_euc_c/src/lib.rs new file mode 100644 index 000000000000..3f168d631796 --- /dev/null +++ b/third_party/rust/shift_or_euc_c/src/lib.rs @@ -0,0 +1,94 @@ +// Copyright 2018 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![doc(html_root_url = "https://docs.rs/shift_or_euc_c/0.1.0")] + +//! C API for [`shift_or_euc`](https://docs.rs/shift_or_euc/) +//! +//! # Panics +//! +//! This crate is designed to be used only in a `panic=abort` scenario. +//! Panic propagation across FFI is not handled! +//! +//! # Licensing +//! +//! See the file named [COPYRIGHT](https://github.com/hsivonen/shift_or_euc/blob/master/COPYRIGHT). + +use encoding_rs::Encoding; +use shift_or_euc::*; + +/// Instantiates the detector. If `allow_2022` is `true` the possible +/// guesses are Shift_JIS, EUC-JP, ISO-2022-JP, and undecided. If +/// `allow_2022` is `false`, the possible guesses are Shift_JIS, EUC-JP, +/// and undecided. +/// +/// The instantiated detector must be freed after use using +/// `shift_or_euc_detector_free`. +#[no_mangle] +pub unsafe extern "C" fn shift_or_euc_detector_new(allow_2022: bool) -> *mut Detector { + Box::into_raw(Box::new(Detector::new(allow_2022))) +} + +/// Deallocates a detector obtained from `shift_or_euc_detector_new`. +#[no_mangle] +pub unsafe extern "C" fn shift_or_euc_detector_free(detector: *mut Detector) { + let _ = Box::from_raw(detector); +} + +/// Feeds bytes to the detector. 
If `last` is `true` the end of the stream +/// is considered to occur immediately after the end of `buffer`. +/// Otherwise, the stream is expected to continue. `buffer_len` may be zero. +/// `buffer` must not be `NULL` but may be undereferencable when +/// `buffer_len` is zero. +/// +/// If you're running the detector only on a prefix of a complete +/// document, _do not_ pass `last` as `true` after the prefix if the +/// stream as a whole still contains more content. +/// +/// Returns `SHIFT_JIS_ENCODING` if the detector guessed +/// Shift_JIS. Returns `EUC_JP_ENCODING` if the detector +/// guessed EUC-JP. Returns `ISO_2022_JP_ENCODING` if the +/// detector guessed ISO-2022-JP (only possible if `true` was passed as +/// `allow_2022` when instantiating the detector). Returns `NULL` if the +/// detector is undecided. If `NULL` is returned even when passing `true` +/// as `last`, falling back to Shift_JIS is the best guess for Web +/// purposes. +/// +/// Do not call again after the function has returned non-`NULL` or after +/// the function has been called with `true` as `last`. +/// +/// # Panics +/// +/// If called after the function has returned non-`NULL` or after the +/// function has been called with `true` as `last`. +/// +/// # Undefined Behavior +/// +/// UB ensues if +/// +/// * `detector` does not point to a detector obtained from +/// `shift_or_euc_detector_new` but not yet freed with +/// `shift_or_euc_detector_free`. +/// * `buffer` is `NULL`. +/// * `buffer` and `buffer_len` don't designate a range of memory +/// valid for reading. 
+#[no_mangle] +pub unsafe extern "C" fn shift_or_euc_detector_feed( + detector: *mut Detector, + buffer: *const u8, + buffer_len: usize, + last: bool, +) -> *const Encoding { + if let Some(encoding) = (*detector).feed(::std::slice::from_raw_parts(buffer, buffer_len), last) + { + encoding + } else { + ::std::ptr::null() + } +} diff --git a/toolkit/actors/ViewSourceChild.jsm b/toolkit/actors/ViewSourceChild.jsm index d5f27d11c5c8..471d5929b35d 100644 --- a/toolkit/actors/ViewSourceChild.jsm +++ b/toolkit/actors/ViewSourceChild.jsm @@ -52,19 +52,20 @@ class ViewSourceChild extends JSWindowActorChild { * loading. */ viewSource(URL, outerWindowID, lineNumber) { - let otherDocShell; - let forceEncodingDetection = false; + let otherDocShell, forcedCharSet; if (outerWindowID) { let contentWindow = Services.wm.getOuterWindowWithId(outerWindowID); if (contentWindow) { otherDocShell = contentWindow.docShell; - forceEncodingDetection = contentWindow.windowUtils.docCharsetIsForced; + let utils = contentWindow.windowUtils; + let doc = contentWindow.document; + forcedCharSet = utils.docCharsetIsForced ? doc.characterSet : null; } } - this.loadSource(URL, otherDocShell, lineNumber, forceEncodingDetection); + this.loadSource(URL, otherDocShell, lineNumber, forcedCharSet); } /** @@ -104,14 +105,18 @@ class ViewSourceChild extends JSWindowActorChild { * @param lineNumber (optional) * The line number to focus as soon as the source has finished * loading. - * @param forceEncodingDetection (optional) - * Force autodetection of the character encoding. + * @param forcedCharSet (optional) + * The document character set to use instead of the default one. 
*/ - loadSource(URL, otherDocShell, lineNumber, forceEncodingDetection) { + loadSource(URL, otherDocShell, lineNumber, forcedCharSet) { const viewSrcURL = "view-source:" + URL; - if (forceEncodingDetection) { - this.docShell.forceEncodingDetection(); + if (forcedCharSet) { + try { + this.docShell.charset = forcedCharSet; + } catch (e) { + /* invalid charset */ + } } ViewSourcePageChild.setInitialLineNumber(lineNumber); diff --git a/toolkit/content/widgets/browser-custom-element.js b/toolkit/content/widgets/browser-custom-element.js index 440f0f7b2301..778cadd6b810 100644 --- a/toolkit/content/widgets/browser-custom-element.js +++ b/toolkit/content/widgets/browser-custom-element.js @@ -255,6 +255,8 @@ this._mayEnableCharacterEncodingMenu = null; + this._charsetAutodetected = false; + this._contentPrincipal = null; this._contentPartitionedPrincipal = null; @@ -583,11 +585,17 @@ : this.contentDocument.title; } - forceEncodingDetection() { + set characterSet(val) { if (this.isRemoteBrowser) { - this.sendMessageToActor("ForceEncodingDetection", {}, "BrowserTab"); + this.sendMessageToActor( + "UpdateCharacterSet", + { value: val }, + "BrowserTab" + ); + this._characterSet = val; } else { - this.docShell.forceEncodingDetection(); + this.docShell.charset = val; + this.docShell.gatherCharsetMenuTelemetry(); } } @@ -607,6 +615,18 @@ } } + get charsetAutodetected() { + return this.isRemoteBrowser + ? this._charsetAutodetected + : this.docShell.charsetAutodetected; + } + + set charsetAutodetected(aAutodetected) { + if (this.isRemoteBrowser) { + this._charsetAutodetected = aAutodetected; + } + } + get contentPrincipal() { return this.isRemoteBrowser ? 
this._contentPrincipal @@ -1127,6 +1147,7 @@ aLocation, aCharset, aMayEnableCharacterEncodingMenu, + aCharsetAutodetected, aDocumentURI, aTitle, aContentPrincipal, @@ -1142,6 +1163,7 @@ if (aCharset != null) { this._characterSet = aCharset; this._mayEnableCharacterEncodingMenu = aMayEnableCharacterEncodingMenu; + this._charsetAutodetected = aCharsetAutodetected; } if (aContentType != null) { @@ -1556,6 +1578,7 @@ "_documentContentType", "_characterSet", "_mayEnableCharacterEncodingMenu", + "_charsetAutodetected", "_contentPrincipal", "_contentPartitionedPrincipal", "_isSyntheticDocument", diff --git a/toolkit/library/rust/shared/Cargo.toml b/toolkit/library/rust/shared/Cargo.toml index a2a4ce13c594..a981e38bba12 100644 --- a/toolkit/library/rust/shared/Cargo.toml +++ b/toolkit/library/rust/shared/Cargo.toml @@ -39,6 +39,7 @@ cert_storage = { path = "../../../../security/manager/ssl/cert_storage" } bitsdownload = { path = "../../../components/bitsdownload", optional = true } storage = { path = "../../../../storage/rust" } bookmark_sync = { path = "../../../components/places/bookmark_sync", optional = true } +shift_or_euc_c = "0.1.0" chardetng_c = "0.1.1" audio_thread_priority = "0.23.4" mdns_service = { path="../../../../dom/media/webrtc/transport/mdns_service", optional = true } diff --git a/toolkit/library/rust/shared/lib.rs b/toolkit/library/rust/shared/lib.rs index 12663cb62ea8..74f7f005251d 100644 --- a/toolkit/library/rust/shared/lib.rs +++ b/toolkit/library/rust/shared/lib.rs @@ -45,6 +45,7 @@ extern crate processtools; #[cfg(feature = "gecko_profiler")] extern crate profiler_helper; extern crate rsdparsa_capi; +extern crate shift_or_euc_c; extern crate static_prefs; extern crate storage; #[cfg(feature = "quantum_render")] diff --git a/toolkit/locales/en-US/chrome/global/charsetMenu.dtd b/toolkit/locales/en-US/chrome/global/charsetMenu.dtd new file mode 100644 index 000000000000..4c235ac0eac0 --- /dev/null +++ 
b/toolkit/locales/en-US/chrome/global/charsetMenu.dtd @@ -0,0 +1,5 @@ + + + diff --git a/toolkit/locales/en-US/chrome/global/charsetMenu.properties b/toolkit/locales/en-US/chrome/global/charsetMenu.properties new file mode 100644 index 000000000000..9876dc5a267e --- /dev/null +++ b/toolkit/locales/en-US/chrome/global/charsetMenu.properties @@ -0,0 +1,114 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# LOCALIZATION NOTE: The property keys ending with ".key" are for access keys. +# Localizations may add or delete properties where the property key ends with +# ".key" as appropriate for the localization. The code that uses this data can +# deal with the absence of an access key for an item. +# +# For gbk, gbk.bis and gbk.bis.key are used to trigger string changes in +# localizations. +# +# In the en-US version of this file, access keys are given to the following: +# * UTF-8 +# * All encodings that are the fallback encoding for some locale in Firefox +# * All encodings that are the fallback encoding for some locale in IE +# * All Japanese encodings +# +# For the items whose property key does not end in ".key" and whose value +# includes "(" U+0028 LEFT PARENTHESIS, the "(" character is significant for +# processing by CharsetMenu.jsm. If your localization does not use ASCII +# parentheses where en-US does in this file, please file a bug to make +# CharsetMenu.jsm also recognize the delimiter your localization uses. +# (When this code was developed, all localizations appeared to use +# U+0028 LEFT PARENTHESIS for this purpose.) 
+ +# Globally-relevant + +_autodetect_all.key = m +_autodetect_all = Automatic +UTF-8.key = U +UTF-8 = Unicode +windows-1252.key = W +windows-1252 = Western + +# Arabic +windows-1256.key = A +windows-1256 = Arabic (Windows) +ISO-8859-6 = Arabic (ISO) + +# Baltic +windows-1257.key = B +windows-1257 = Baltic (Windows) +ISO-8859-4 = Baltic (ISO) + +# Central European +windows-1250.key = E +windows-1250 = Central European (Windows) +ISO-8859-2.key = l +ISO-8859-2 = Central European (ISO) + +# Chinese, Simplified +gbk.bis.key = S +gbk.bis = Chinese, Simplified + +# Chinese, Traditional +Big5.key = T +Big5 = Chinese, Traditional + +# Cyrillic +windows-1251.key = C +windows-1251 = Cyrillic (Windows) +ISO-8859-5 = Cyrillic (ISO) +KOI8-R = Cyrillic (KOI8-R) +KOI8-U = Cyrillic (KOI8-U) +IBM866 = Cyrillic (DOS) + +# UI string in anticipation of Cyrillic analog of bug 1543077; +# deliberately not in use yet + +# LOCALIZATION NOTE (Cyrillic.key): If taken into use, this string will appear +# instead of the string for windows-1251.key, so the use of the same +# accelerator is deliberate. +Cyrillic.key = C +# LOCALIZATION NOTE (Cyrillic): If taken into use, this string will appear +# as a single item instead of the five items windows-1251, ISO-8859-5, +# KOI8-R, KOI8-U, and IBM866, so this string does not need to make sense +# together with those strings and should be translated the way those were +# but omitting the part in parentheses. +Cyrillic = Cyrillic + +# Greek +windows-1253.key = G +windows-1253 = Greek (Windows) +ISO-8859-7.key = O +ISO-8859-7 = Greek (ISO) + +# Hebrew +windows-1255.key = H +windows-1255 = Hebrew +# LOCALIZATION NOTE (ISO-8859-8): The value for this item should begin with +# the same word for Hebrew as the value for windows-1255 so that this item +# sorts right after that one in the collation order for your locale. 
+ISO-8859-8 = Hebrew, Visual + +# Japanese (NOT AN ENCODING NAME) +Japanese.key = J +Japanese = Japanese + +# Korean +EUC-KR.key = K +EUC-KR = Korean + +# Thai +windows-874.key = i +windows-874 = Thai + +# Turkish +windows-1254.key = r +windows-1254 = Turkish + +# Vietnamese +windows-1258.key = V +windows-1258 = Vietnamese diff --git a/toolkit/locales/jar.mn b/toolkit/locales/jar.mn index 2cb394e095c8..abe3bbc7dea1 100644 --- a/toolkit/locales/jar.mn +++ b/toolkit/locales/jar.mn @@ -14,6 +14,8 @@ locale/@AB_CD@/global/autocomplete.properties (%chrome/global/autocomplete.properties) locale/@AB_CD@/global/appPicker.dtd (%chrome/global/appPicker.dtd) locale/@AB_CD@/global/browser.properties (%chrome/global/browser.properties) + locale/@AB_CD@/global/charsetMenu.dtd (%chrome/global/charsetMenu.dtd) + locale/@AB_CD@/global/charsetMenu.properties (%chrome/global/charsetMenu.properties) locale/@AB_CD@/global/commonDialog.dtd (%chrome/global/commonDialog.dtd) locale/@AB_CD@/global/commonDialogs.properties (%chrome/global/commonDialogs.properties) locale/@AB_CD@/global/contentAreaCommands.properties (%chrome/global/contentAreaCommands.properties) diff --git a/toolkit/modules/CharsetMenu.jsm b/toolkit/modules/CharsetMenu.jsm new file mode 100644 index 000000000000..2a3335dabd2e --- /dev/null +++ b/toolkit/modules/CharsetMenu.jsm @@ -0,0 +1,223 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +var EXPORTED_SYMBOLS = ["CharsetMenu"]; + +const { Services } = ChromeUtils.import("resource://gre/modules/Services.jsm"); +const { XPCOMUtils } = ChromeUtils.import( + "resource://gre/modules/XPCOMUtils.jsm" +); +XPCOMUtils.defineLazyGetter(this, "gBundle", function() { + const kUrl = "chrome://global/locale/charsetMenu.properties"; + return Services.strings.createBundle(kUrl); +}); + +ChromeUtils.defineModuleGetter( + this, + "Deprecated", + "resource://gre/modules/Deprecated.jsm" +); + +/** + * This set contains encodings that are in the Encoding Standard, except: + * - Japanese encodings are represented by one autodetection item + * - x-user-defined, which practically never makes sense as an end-user-chosen + * override. + * - Encodings that IE11 doesn't have in its corresponding menu. + */ +const kEncodings = new Set([ + // Globally relevant + "_autodetect_all", // (NOT AN ENCODING NAME; using IE-consistent magic name) + "UTF-8", + "windows-1252", + // Arabic + "windows-1256", + "ISO-8859-6", + // Baltic + "windows-1257", + "ISO-8859-4", + // "ISO-8859-13", // Hidden since not in menu in IE11 + // Central European + "windows-1250", + "ISO-8859-2", + // Chinese, Simplified + "GBK", + // Chinese, Traditional + "Big5", + // Cyrillic + "windows-1251", + "ISO-8859-5", + "KOI8-R", + "KOI8-U", + "IBM866", // Not in menu in Chromium. Maybe drop this? + // "x-mac-cyrillic", // Not in menu in IE11 or Chromium. 
+ // Greek + "windows-1253", + "ISO-8859-7", + // Hebrew + "windows-1255", + "ISO-8859-8", + // Japanese (NOT AN ENCODING NAME) + "Japanese", + // Korean + "EUC-KR", + // Thai + "windows-874", + // Turkish + "windows-1254", + // Vietnamese + "windows-1258", + // Hiding rare European encodings that aren't in the menu in IE11 and would + // make the menu messy by sorting all over the place + // "ISO-8859-3", + // "ISO-8859-10", + // "ISO-8859-14", + // "ISO-8859-15", + // "ISO-8859-16", + // "macintosh" +]); + +// Always at the start of the menu, in this order, followed by a separator. +const kPinned = ["_autodetect_all", "UTF-8", "windows-1252"]; + +kPinned.forEach(x => kEncodings.delete(x)); + +function CharsetComparator(a, b) { + // Normal sorting sorts the part in parenthesis in an order that + // happens to make the less frequently-used items first. + let titleA = a.label.replace(/\(.*/, "") + b.value; + let titleB = b.label.replace(/\(.*/, "") + a.value; + // Secondarily reverse sort by encoding name to sort "windows" + return titleA.localeCompare(titleB) || b.value.localeCompare(a.value); +} + +var gCharsetInfoCache, gPinnedInfoCache; + +var CharsetMenu = { + build(parent, deprecatedShowAccessKeys = true) { + if (!deprecatedShowAccessKeys) { + Deprecated.warning( + "CharsetMenu no longer supports building a menu with no access keys.", + "https://bugzilla.mozilla.org/show_bug.cgi?id=1088710" + ); + } + function createDOMNode(doc, nodeInfo) { + let node = doc.createXULElement("menuitem"); + node.setAttribute("type", "radio"); + node.setAttribute("name", nodeInfo.name + "Group"); + node.setAttribute(nodeInfo.name, nodeInfo.value); + node.setAttribute("label", nodeInfo.label); + if (nodeInfo.accesskey) { + node.setAttribute("accesskey", nodeInfo.accesskey); + } + return node; + } + + if (parent.hasChildNodes()) { + // Charset menu already built + return; + } + this._ensureDataReady(); + let doc = parent.ownerDocument; + + gPinnedInfoCache.forEach(charsetInfo => + 
parent.appendChild(createDOMNode(doc, charsetInfo)) + ); + parent.appendChild(doc.createXULElement("menuseparator")); + gCharsetInfoCache.forEach(charsetInfo => + parent.appendChild(createDOMNode(doc, charsetInfo)) + ); + }, + + getData() { + this._ensureDataReady(); + return { + pinnedCharsets: gPinnedInfoCache, + otherCharsets: gCharsetInfoCache, + }; + }, + + _ensureDataReady() { + if (!gCharsetInfoCache) { + gPinnedInfoCache = this.getCharsetInfo(kPinned, false); + gCharsetInfoCache = this.getCharsetInfo(kEncodings); + } + }, + + getCharsetInfo(charsets, sort = true) { + let list = Array.from(charsets, charset => ({ + label: this._getCharsetLabel(charset), + accesskey: this._getCharsetAccessKey(charset), + name: "charset", + value: charset, + })); + + if (sort) { + list.sort(CharsetComparator); + } + return list; + }, + + _getCharsetLabel(charset) { + if (charset == "GBK") { + // Localization key has been revised + charset = "gbk.bis"; + } + try { + return gBundle.GetStringFromName(charset); + } catch (ex) {} + return charset; + }, + _getCharsetAccessKey(charset) { + if (charset == "GBK") { + // Localization key has been revised + charset = "gbk.bis"; + } + try { + return gBundle.GetStringFromName(charset + ".key"); + } catch (ex) {} + return ""; + }, + + /** + * For substantially similar encodings, treat two encodings as the same + * for the purpose of the check mark. + */ + foldCharset(charset, isAutodetected) { + if (isAutodetected) { + switch (charset) { + case "Shift_JIS": + case "EUC-JP": + case "ISO-2022-JP": + return "Japanese"; + default: + // fall through + } + } + switch (charset) { + case "ISO-8859-8-I": + return "windows-1255"; + + case "gb18030": + return "GBK"; + + default: + return charset; + } + }, + + /** + * This method is for comm-central callers only. 
+ */ + update(parent, charset) { + let menuitem = parent + .getElementsByAttribute("charset", this.foldCharset(charset, false)) + .item(0); + if (menuitem) { + menuitem.setAttribute("checked", "true"); + } + }, +}; + +Object.freeze(CharsetMenu); diff --git a/toolkit/modules/moz.build b/toolkit/modules/moz.build index 8ac56c81e646..faf9673c33e0 100644 --- a/toolkit/modules/moz.build +++ b/toolkit/modules/moz.build @@ -48,6 +48,9 @@ with Files("tests/xpcshell/test_UpdateUtils*.js"): with Files("AsyncPrefs.jsm"): BUG_COMPONENT = ("Core", "Security: Process Sandboxing") +with Files("CharsetMenu.jsm"): + BUG_COMPONENT = ("Firefox", "Toolbars and Customization") + with Files("Color.jsm"): BUG_COMPONENT = ("Toolkit", "Find Toolbar") @@ -157,6 +160,7 @@ EXTRA_JS_MODULES += [ "BrowserUtils.jsm", "CanonicalJSON.jsm", "CertUtils.jsm", + "CharsetMenu.jsm", "Color.jsm", "Console.jsm", "ContentDOMReference.jsm",