Bug 1482095 - Update encoding_rs to 0.8.6. r=emk.

MozReview-Commit-ID: IqPrrQ7L1lU
2018-08-09 15:35:34 +03:00 · 2018-08-09 15:35:34 +03:00 · c82d099240
--- a/Cargo.lock
+++ b/Cargo.lock
@ -634,21 +634,21 @@ name = "encoding_c"
 version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "encoding_rs 0.8.6 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

 [[package]]
 name = "encoding_glue"
 version = "0.1.0"
 dependencies = [
- "encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "encoding_rs 0.8.6 (registry+https://github.com/rust-lang/crates.io-index)",
 "nserror 0.1.0",
 "nsstring 0.1.0",
 ]

 [[package]]
 name = "encoding_rs"
-version = "0.8.4"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
 "cfg-if 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
@ -1458,7 +1458,7 @@ name = "nsstring"
 version = "0.1.0"
 dependencies = [
 "bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
- "encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "encoding_rs 0.8.6 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

 [[package]]
@ -2684,7 +2684,7 @@ dependencies = [
 "checksum either 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "18785c1ba806c258137c937e44ada9ee7e69a37e3c72077542cd2f069d78562a"
 "checksum ena 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "cabe5a5078ac8c506d3e4430763b1ba9b609b1286913e7d08e581d1c2de9b7e5"
 "checksum encoding_c 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "769ecb8b33323998e482b218c0d13cd64c267609023b4b7ec3ee740714c318ee"
-"checksum encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88a1b66a0d28af4b03a8c8278c6dcb90e6e600d89c14500a9e7a02e64b9ee3ac"
+"checksum encoding_rs 0.8.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2a91912d6f37c6a8fef8a2316a862542d036f13c923ad518b5aca7bcaac7544c"
 "checksum env_logger 0.5.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0561146661ae44c579e993456bc76d11ce1e0c7d745e57b2fa7146b6e49fa2ad"
 "checksum error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ff511d5dc435d703f4971bc399647c9bc38e20cb41452e3b9feb4765419ed3f3"
 "checksum euclid 0.19.0 (registry+https://github.com/rust-lang/crates.io-index)" = "70a2ebdf55fb9d6329046e026329a55ef8fbaae5ea833f56e170beb3125a8a5f"
--- a/third_party/rust/encoding_rs/.cargo-checksum.json
+++ b/third_party/rust/encoding_rs/.cargo-checksum.json
--- a/third_party/rust/encoding_rs/Cargo.toml
+++ b/third_party/rust/encoding_rs/Cargo.toml
@ -12,7 +12,7 @@

 [package]
 name = "encoding_rs"
-version = "0.8.4"
+version = "0.8.6"
 authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
 description = "A Gecko-oriented implementation of the Encoding Standard"
 homepage = "https://docs.rs/encoding_rs/"
--- a/third_party/rust/encoding_rs/README.md
+++ b/third_party/rust/encoding_rs/README.md
@ -68,6 +68,13 @@ Additionally, `encoding_rs::mem` does the following:
 * Converts ASCII to UTF-16 up to the first non-ASCII byte.
 * Converts UTF-16 to ASCII up to the first non-Basic Latin code unit.

+## Integration with `std::io`
+
+Notably, the above feature list doesn't include the capability to wrap
+a `std::io::Read`, decode it into UTF-8 and present the result via
+`std::io::Read`. The [`encoding_rs_io`](https://crates.io/crates/encoding_rs_io)
+crate provides that capability.
+
 ## Licensing

 Please see the file named
@ -237,6 +244,22 @@ used in Firefox.

 ## Release Notes

+### 0.8.6
+
+* Temporarily removed the debug assertion added in version 0.8.5 from
+  `convert_utf16_to_latin1_lossy`.
+
+### 0.8.5
+
+* If debug assertions are enabled but fuzzing isn't enabled, lossy conversions
+  to Latin1 in the `mem` module assert that the input is in the range
+  U+0000...U+00FF (inclusive).
+* In the `mem` module provide conversions from Latin1 and UTF-16 to UTF-8
+  that can deal with insufficient output space. The idea is to use them
+  first with an allocation rounded up to jemalloc bucket size and do the
+  worst-case allocation only if the jemalloc rounding up was insufficient
+  as the first guess.
+
 ### 0.8.4

 * Fix SSE2-specific, `simd-accel`-specific memory corruption introduced in
--- a/third_party/rust/encoding_rs/src/lib.rs
+++ b/third_party/rust/encoding_rs/src/lib.rs
@ -8,7 +8,7 @@
 // except according to those terms.

 #![cfg_attr(feature = "cargo-clippy", allow(doc_markdown, inline_always, new_ret_no_self))]
-#![doc(html_root_url = "https://docs.rs/encoding_rs/0.8.4")]
+#![doc(html_root_url = "https://docs.rs/encoding_rs/0.8.6")]

 //! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
 //! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
@ -36,6 +36,13 @@
 //! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
 //! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
 //!
+//! # Integration with `std::io`
+//!
+//! This crate doesn't implement traits from `std::io`. However, the case of
+//! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
+//! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
+//! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
+//!
 //! # Examples
 //!
 //! Example programs:
--- a/third_party/rust/encoding_rs/src/mem.rs
+++ b/third_party/rust/encoding_rs/src/mem.rs
@ -31,6 +31,10 @@ use super::EncoderResult;
 use ascii::*;
 use utf_8::*;

+macro_rules! non_fuzz_debug_assert {
+    ($($arg:tt)*) => (if !cfg!(fuzzing) { debug_assert!($($arg)*); })
+}
+
 cfg_if!{
    if #[cfg(feature = "simd-accel")] {
        use ::std::intrinsics::unlikely;
@ -1547,6 +1551,33 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    }
 }

+/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
+/// with the REPLACEMENT CHARACTER with potentially insufficient output
+/// space.
+///
+/// Returns the number of code units read and the number of bytes written.
+///
+/// Not all code units are read if there isn't enough output space.
+///
+/// Note that this method isn't designed for general streamability but for
+/// not allocating memory for the worst case up front. Specifically,
+/// if the input starts with or ends with an unpaired surrogate, those are
+/// replaced with the REPLACEMENT CHARACTER.
+///
+/// # Safety
+///
+/// Note that this function may write garbage beyond the number of bytes
+/// indicated by the return value, so using a `&mut str` interpreted as
+/// `&mut [u8]` as the destination is not safe. If you want to convert into
+/// a `&mut str`, use `convert_utf16_to_str_partial()` instead of this function.
+#[inline]
+pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
+    let mut encoder = Utf8Encoder;
+    let (result, read, written) = encoder.encode_from_utf16_raw(src, dst, true);
+    debug_assert!(result == EncoderResult::OutputFull || read == src.len());
+    (read, written)
+}
+
 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
 /// with the REPLACEMENT CHARACTER.
 ///
@ -1568,12 +1599,42 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
 #[inline]
 pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
    assert!(dst.len() >= src.len() * 3 + 1);
-    let mut encoder = Utf8Encoder;
-    let (result, _, written) = encoder.encode_from_utf16_raw(src, dst, true);
-    debug_assert!(result == EncoderResult::InputEmpty);
+    let (read, written) = convert_utf16_to_utf8_partial(src, dst);
+    debug_assert_eq!(read, src.len());
    written
 }

+/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
+/// with the REPLACEMENT CHARACTER such that the validity of the output is
+/// signaled using the Rust type system with potentially insufficient output
+/// space.
+///
+/// Returns the number of code units read and the number of bytes written.
+///
+/// Not all code units are read if there isn't enough output space.
+///
+/// Note that this method isn't designed for general streamability but for
+/// not allocating memory for the worst case up front. Specifically,
+/// if the input starts with or ends with an unpaired surrogate, those are
+/// replaced with the REPLACEMENT CHARACTER.
+#[inline]
+pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
+    let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) };
+    let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
+    let len = bytes.len();
+    let mut trail = written;
+    let max = ::std::cmp::min(len, trail + MAX_STRIDE_SIZE);
+    while trail < max {
+        bytes[trail] = 0;
+        trail += 1;
+    }
+    while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
+        bytes[trail] = 0;
+        trail += 1;
+    }
+    (read, written)
+}
+
 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
 /// with the REPLACEMENT CHARACTER such that the validity of the output is
 /// signaled using the Rust type system.
@ -1588,19 +1649,9 @@ pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
 /// Panics if the destination buffer is shorter than stated above.
 #[inline]
 pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
-    let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) };
-    let written = convert_utf16_to_utf8(src, bytes);
-    let len = bytes.len();
-    let mut trail = written;
-    let max = ::std::cmp::min(len, trail + MAX_STRIDE_SIZE);
-    while trail < max {
-        bytes[trail] = 0;
-        trail += 1;
-    }
-    while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
-        bytes[trail] = 0;
-        trail += 1;
-    }
+    assert!(dst.len() >= src.len() * 3 + 1);
+    let (read, written) = convert_utf16_to_str_partial(src, dst);
+    debug_assert_eq!(read, src.len());
    written
 }

@ -1629,6 +1680,59 @@ pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
    }
 }

+/// Converts bytes whose unsigned value is interpreted as Unicode code point
+/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
+/// output space.
+///
+/// Returns the number of bytes read and the number of bytes written.
+///
+/// If the output isn't large enough, not all input is consumed.
+///
+/// # Safety
+///
+/// Note that this function may write garbage beyond the number of bytes
+/// indicated by the return value, so using a `&mut str` interpreted as
+/// `&mut [u8]` as the destination is not safe. If you want to convert into
+/// a `&mut str`, use `convert_latin1_to_str_partial()` instead of this function.
+#[inline]
+pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
+    let src_len = src.len();
+    let src_ptr = src.as_ptr();
+    let dst_ptr = dst.as_mut_ptr();
+    let dst_len = dst.len();
+    let mut total_read = 0usize;
+    let mut total_written = 0usize;
+    loop {
+        // src can't advance more than dst
+        let src_left = src_len - total_read;
+        let dst_left = dst_len - total_written;
+        let min_left = ::std::cmp::min(src_left, dst_left);
+        if let Some((non_ascii, consumed)) = unsafe {
+            ascii_to_ascii(
+                src_ptr.offset(total_read as isize),
+                dst_ptr.offset(total_written as isize),
+                min_left,
+            )
+        } {
+            total_read += consumed;
+            total_written += consumed;
+            if total_written.checked_add(2).unwrap() > dst_len {
+                return (total_read, total_written);
+            }
+
+            total_read += 1; // consume `non_ascii`
+
+            let code_point = non_ascii as u32;
+            dst[total_written] = ((code_point >> 6) | 0xC0u32) as u8;
+            total_written += 1;
+            dst[total_written] = ((code_point as u32 & 0x3Fu32) | 0x80u32) as u8;
+            total_written += 1;
+            continue;
+        }
+        return (total_read + min_left, total_written + min_left);
+    }
+}
+
 /// Converts bytes whose unsigned value is interpreted as Unicode code point
 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
 ///
@ -1653,33 +1757,35 @@ pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
        dst.len() >= src.len() * 2,
        "Destination must not be shorter than the source times two."
    );
-    let src_len = src.len();
-    let src_ptr = src.as_ptr();
-    let dst_ptr = dst.as_mut_ptr();
-    let mut total_read = 0usize;
-    let mut total_written = 0usize;
-    loop {
-        // src can't advance more than dst
-        let src_left = src_len - total_read;
-        if let Some((non_ascii, consumed)) = unsafe {
-            ascii_to_ascii(
-                src_ptr.offset(total_read as isize),
-                dst_ptr.offset(total_written as isize),
-                src_left,
-            )
-        } {
-            total_read += consumed + 1;
-            total_written += consumed;
+    let (read, written) = convert_latin1_to_utf8_partial(src, dst);
+    debug_assert_eq!(read, src.len());
+    written
+}

-            let code_point = non_ascii as u32;
-            dst[total_written] = ((code_point >> 6) | 0xC0u32) as u8;
-            total_written += 1;
-            dst[total_written] = ((code_point as u32 & 0x3Fu32) | 0x80u32) as u8;
-            total_written += 1;
-            continue;
-        }
-        return total_written + src_left;
+/// Converts bytes whose unsigned value is interpreted as Unicode code point
+/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
+/// output is signaled using the Rust type system with potentially insufficient
+/// output space.
+///
+/// Returns the number of bytes read and the number of bytes written.
+///
+/// If the output isn't large enough, not all input is consumed.
+#[inline]
+pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
+    let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) };
+    let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
+    let len = bytes.len();
+    let mut trail = written;
+    let max = ::std::cmp::min(len, trail + MAX_STRIDE_SIZE);
+    while trail < max {
+        bytes[trail] = 0;
+        trail += 1;
    }
+    while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
+        bytes[trail] = 0;
+        trail += 1;
+    }
+    (read, written)
 }

 /// Converts bytes whose unsigned value is interpreted as Unicode code point
@ -1696,19 +1802,12 @@ pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
 /// Panics if the destination buffer is shorter than stated above.
 #[inline]
 pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
-    let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) };
-    let written = convert_latin1_to_utf8(src, bytes);
-    let len = bytes.len();
-    let mut trail = written;
-    let max = ::std::cmp::min(len, trail + MAX_STRIDE_SIZE);
-    while trail < max {
-        bytes[trail] = 0;
-        trail += 1;
-    }
-    while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
-        bytes[trail] = 0;
-        trail += 1;
-    }
+    assert!(
+        dst.len() >= src.len() * 2,
+        "Destination must not be shorter than the source times two."
+    );
+    let (read, written) = convert_latin1_to_str_partial(src, dst);
+    debug_assert_eq!(read, src.len());
    written
 }

@ -1718,6 +1817,7 @@ pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
 /// each output byte.
 ///
 /// If the input does not fulfill the condition stated above, this function
+/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
 /// does something that is memory-safe without any promises about any
 /// properties of the output. In particular, callers shouldn't assume the
 /// output to be the same across crate versions or CPU architectures and
@ -1731,12 +1831,16 @@ pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
 /// # Panics
 ///
 /// Panics if the destination buffer is shorter than stated above.
+///
+/// Also panics if debug assertions are enabled (and not fuzzing) and the
+/// input is not in the range U+0000 to U+00FF, inclusive.
 #[inline]
 pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
+    non_fuzz_debug_assert!(is_utf8_latin1(src));
    let src_len = src.len();
    let src_ptr = src.as_ptr();
    let dst_ptr = dst.as_mut_ptr();
@ -1776,11 +1880,12 @@ pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
 /// represents the value of each code point as the unsigned byte value of
 /// each output byte.
 ///
-/// If the input does not fulfill the condition stated above, this function
-/// does something that is memory-safe without any promises about any
-/// properties of the output. In particular, callers shouldn't assume the
-/// output to be the same across crate versions or CPU architectures and
-/// should not assume that non-Basic Latin input can't map to ASCII output.
+/// If the input does not fulfill the condition stated above, this function
+/// does something that is memory-safe without any promises about any
+/// properties of the output and will probably assert in debug builds in
+/// future versions. In particular, callers shouldn't assume the output to be
+/// the same across crate versions or CPU architectures and should not assume
+/// that non-ASCII input can't map to ASCII output.
 ///
 /// The length of the destination buffer must be at least the length of the
 /// source buffer.
@ -1790,12 +1895,16 @@ pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
 /// # Panics
 ///
 /// Panics if the destination buffer is shorter than stated above.
+///
+/// (Future versions will probably also panic if debug assertions are enabled (and
+/// not fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
 #[inline]
 pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
+    // non_fuzz_debug_assert!(is_utf16_latin1(src));
    unsafe {
        pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
    }
@ -2100,6 +2209,18 @@ mod tests {
        assert_eq!(dst, reference);
    }

+    #[test]
+    fn test_convert_utf16_to_utf8_partial() {
+        let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
+        let src: Vec<u16> = reference.encode_utf16().collect();
+        let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
+        dst.resize(src.len() * 3 + 1, 0);
+        let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]);
+        let len = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]);
+        dst.truncate(len);
+        assert_eq!(dst, reference.as_bytes());
+    }
+
    #[test]
    fn test_convert_utf16_to_utf8() {
        let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
@ -2127,6 +2248,14 @@ mod tests {
        assert_eq!(dst, reference);
    }

+    #[test]
+    fn test_convert_latin1_to_utf8_partial() {
+        let mut dst = [0u8; 2]; // too small for 'a' plus the two-byte UTF-8 form of U+00FF
+        let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
+        assert_eq!(read, 1); // only the leading ASCII byte is consumed
+        assert_eq!(written, 1);
+    }
+
    #[test]
    fn test_convert_latin1_to_utf8() {
        let mut src: Vec<u8> = Vec::with_capacity(256);
@ -2163,6 +2292,13 @@ mod tests {
        assert_eq!(dst, reference);
    }

+    #[test]
+    #[should_panic]
+    fn test_convert_utf8_to_latin1_lossy_panics() {
+        let mut dst = [0u8; 16];
+        let _ = convert_utf8_to_latin1_lossy("\u{100}".as_bytes(), &mut dst[..]);
+    }
+
    #[test]
    fn test_convert_utf16_to_latin1_lossy() {
        let mut src: Vec<u16> = Vec::with_capacity(256);
@ -2179,6 +2315,13 @@ mod tests {
        assert_eq!(dst, reference);
    }

+    #[test]
+    // #[should_panic]
+    fn test_convert_utf16_to_latin1_lossy_panics() {
+        let mut dst = [0u8; 16];
+        let _ = convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]);
+    }
+
    #[test]
    fn test_utf16_valid_up_to() {
        let valid = vec![