Bug 1482095 - Update encoding_rs to 0.8.6. r=emk.

MozReview-Commit-ID: IqPrrQ7L1lU
This commit is contained in:
Henri Sivonen 2018-08-09 15:35:34 +03:00
Родитель 726074f95d
Коммит c82d099240
6 изменённых файлов: 240 добавлений и 67 удалений

10
Cargo.lock сгенерированный
Просмотреть файл

@ -634,21 +634,21 @@ name = "encoding_c"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "encoding_glue"
version = "0.1.0"
dependencies = [
"encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.6 (registry+https://github.com/rust-lang/crates.io-index)",
"nserror 0.1.0",
"nsstring 0.1.0",
]
[[package]]
name = "encoding_rs"
version = "0.8.4"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
@ -1458,7 +1458,7 @@ name = "nsstring"
version = "0.1.0"
dependencies = [
"bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -2684,7 +2684,7 @@ dependencies = [
"checksum either 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "18785c1ba806c258137c937e44ada9ee7e69a37e3c72077542cd2f069d78562a"
"checksum ena 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "cabe5a5078ac8c506d3e4430763b1ba9b609b1286913e7d08e581d1c2de9b7e5"
"checksum encoding_c 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "769ecb8b33323998e482b218c0d13cd64c267609023b4b7ec3ee740714c318ee"
"checksum encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88a1b66a0d28af4b03a8c8278c6dcb90e6e600d89c14500a9e7a02e64b9ee3ac"
"checksum encoding_rs 0.8.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2a91912d6f37c6a8fef8a2316a862542d036f13c923ad518b5aca7bcaac7544c"
"checksum env_logger 0.5.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0561146661ae44c579e993456bc76d11ce1e0c7d745e57b2fa7146b6e49fa2ad"
"checksum error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ff511d5dc435d703f4971bc399647c9bc38e20cb41452e3b9feb4765419ed3f3"
"checksum euclid 0.19.0 (registry+https://github.com/rust-lang/crates.io-index)" = "70a2ebdf55fb9d6329046e026329a55ef8fbaae5ea833f56e170beb3125a8a5f"

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

2
third_party/rust/encoding_rs/Cargo.toml поставляемый
Просмотреть файл

@ -12,7 +12,7 @@
[package]
name = "encoding_rs"
version = "0.8.4"
version = "0.8.6"
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
description = "A Gecko-oriented implementation of the Encoding Standard"
homepage = "https://docs.rs/encoding_rs/"

23
third_party/rust/encoding_rs/README.md поставляемый
Просмотреть файл

@ -68,6 +68,13 @@ Additionally, `encoding_rs::mem` does the following:
* Converts ASCII to UTF-16 up to the first non-ASCII byte.
* Converts UTF-16 to ASCII up to the first non-Basic Latin code unit.
## Integration with `std::io`
Notably, the above feature list doesn't include the capability to wrap
a `std::io::Read`, decode it into UTF-8 and present the result via
`std::io::Read`. The [`encoding_rs_io`](https://crates.io/crates/encoding_rs_io)
crate provides that capability.
## Licensing
Please see the file named
@ -237,6 +244,22 @@ used in Firefox.
## Release Notes
### 0.8.6
* Temporarily removed the debug assertion added in version 0.8.5 from
`convert_utf16_to_latin1_lossy`.
### 0.8.5
* If debug assertions are enabled but fuzzing isn't enabled, lossy conversions
to Latin1 in the `mem` module assert that the input is in the range
U+0000...U+00FF (inclusive).
* In the `mem` module provide conversions from Latin1 and UTF-16 to UTF-8
that can deal with insufficient output space. The idea is to use them
first with an allocation rounded up to jemalloc bucket size and do the
worst-case allocation only if the jemalloc rounding up was insufficient
as the first guess.
### 0.8.4
* Fix SSE2-specific, `simd-accel`-specific memory corruption introduced in

9
third_party/rust/encoding_rs/src/lib.rs поставляемый
Просмотреть файл

@ -8,7 +8,7 @@
// except according to those terms.
#![cfg_attr(feature = "cargo-clippy", allow(doc_markdown, inline_always, new_ret_no_self))]
#![doc(html_root_url = "https://docs.rs/encoding_rs/0.8.4")]
#![doc(html_root_url = "https://docs.rs/encoding_rs/0.8.6")]
//! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
//! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
@ -36,6 +36,13 @@
//! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
//! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
//!
//! # Integration with `std::io`
//!
//! This crate doesn't implement traits from `std::io`. However, the case of
//! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
//! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
//! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
//!
//! # Examples
//!
//! Example programs:

261
third_party/rust/encoding_rs/src/mem.rs поставляемый
Просмотреть файл

@ -31,6 +31,10 @@ use super::EncoderResult;
use ascii::*;
use utf_8::*;
// Forwards its arguments to `debug_assert!` unless the crate is compiled
// with `--cfg fuzzing`, in which case the check is skipped entirely.
macro_rules! non_fuzz_debug_assert {
    ($($tokens:tt)*) => {
        if !cfg!(fuzzing) {
            debug_assert!($($tokens)*);
        }
    };
}
cfg_if!{
if #[cfg(feature = "simd-accel")] {
use ::std::intrinsics::unlikely;
@ -1547,6 +1551,33 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
}
}
/// Encodes potentially-invalid UTF-16 as valid UTF-8 into a destination that
/// may be too small for the whole result, substituting the REPLACEMENT
/// CHARACTER for errors.
///
/// Returns a tuple of the number of code units read and the number of bytes
/// written.
///
/// When the output space runs out, the input is not consumed in full.
///
/// This function is not meant for general streaming; its purpose is to avoid
/// allocating for the worst case up front. In particular, an unpaired
/// surrogate at the start or at the end of the input is replaced with the
/// REPLACEMENT CHARACTER.
///
/// # Safety
///
/// Bytes past the returned written count may be overwritten with garbage, so
/// passing a `&mut str` reinterpreted as `&mut [u8]` as the destination is
/// not safe. To convert into a `&mut str`, use
/// `convert_utf16_to_str_partial()` (or `convert_utf16_to_str()` when the
/// destination satisfies that function's length requirement) instead of this
/// function.
#[inline]
pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    let mut utf8_encoder = Utf8Encoder;
    let (status, read, written) = utf8_encoder.encode_from_utf16_raw(src, dst, true);
    // Stopping before the end of the input is only expected when the output
    // filled up.
    debug_assert!(read == src.len() || status == EncoderResult::OutputFull);
    (read, written)
}
/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
/// with the REPLACEMENT CHARACTER.
///
@ -1568,12 +1599,42 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
#[inline]
pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
assert!(dst.len() >= src.len() * 3 + 1);
let mut encoder = Utf8Encoder;
let (result, _, written) = encoder.encode_from_utf16_raw(src, dst, true);
debug_assert!(result == EncoderResult::InputEmpty);
let (read, written) = convert_utf16_to_utf8_partial(src, dst);
debug_assert_eq!(read, src.len());
written
}
/// Encodes potentially-invalid UTF-16 as valid UTF-8 into a `&mut str` that
/// may be too small for the whole result, substituting the REPLACEMENT
/// CHARACTER for errors. The `str` destination type signals the validity of
/// the output through the Rust type system.
///
/// Returns a tuple of the number of code units read and the number of bytes
/// written.
///
/// When the output space runs out, the input is not consumed in full.
///
/// This function is not meant for general streaming; its purpose is to avoid
/// allocating for the worst case up front. In particular, an unpaired
/// surrogate at the start or at the end of the input is replaced with the
/// REPLACEMENT CHARACTER.
#[inline]
pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
    let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) };
    let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
    let len = bytes.len();
    // The byte-level conversion may leave garbage after `written`, so zero a
    // fixed-size window right behind the output...
    let window_end = ::std::cmp::min(len, written + MAX_STRIDE_SIZE);
    for byte in bytes.iter_mut().take(window_end).skip(written) {
        *byte = 0;
    }
    // ...and then keep zeroing for as long as the following bytes look like
    // UTF-8 continuation bytes (0b10xx_xxxx), so that no stray continuation
    // bytes trail the zeroed window.
    let mut trail = window_end;
    while trail < len && (bytes[trail] & 0xC0) == 0x80 {
        bytes[trail] = 0;
        trail += 1;
    }
    (read, written)
}
/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
/// with the REPLACEMENT CHARACTER such that the validity of the output is
/// signaled using the Rust type system.
@ -1588,19 +1649,9 @@ pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
/// Panics if the destination buffer is shorter than stated above.
#[inline]
pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) };
let written = convert_utf16_to_utf8(src, bytes);
let len = bytes.len();
let mut trail = written;
let max = ::std::cmp::min(len, trail + MAX_STRIDE_SIZE);
while trail < max {
bytes[trail] = 0;
trail += 1;
}
while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
bytes[trail] = 0;
trail += 1;
}
assert!(dst.len() >= src.len() * 3 + 1);
let (read, written) = convert_utf16_to_str_partial(src, dst);
debug_assert_eq!(read, src.len());
written
}
@ -1629,6 +1680,59 @@ pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
}
}
/// Converts Latin1 input, i.e. bytes whose unsigned value is interpreted as
/// a Unicode code point (U+0000 to U+00FF, inclusive), to UTF-8 into a
/// destination that may be too small for the whole result.
///
/// Returns a tuple of the number of bytes read and the number of bytes
/// written.
///
/// If the output isn't large enough, not all input is consumed.
///
/// # Safety
///
/// Bytes past the returned written count may be overwritten with garbage, so
/// passing a `&mut str` reinterpreted as `&mut [u8]` as the destination is
/// not safe. To convert into a `&mut str`, use
/// `convert_latin1_to_str_partial()` (or `convert_latin1_to_str()` when the
/// destination satisfies that function's length requirement) instead of this
/// function.
#[inline]
pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
    let src_len = src.len();
    let dst_len = dst.len();
    let src_ptr = src.as_ptr();
    let dst_ptr = dst.as_mut_ptr();
    let mut read = 0usize;
    let mut written = 0usize;
    loop {
        // An ASCII run advances both buffers in lockstep, so it is capped by
        // whichever of the two has less room left.
        let ascii_budget = ::std::cmp::min(src_len - read, dst_len - written);
        let stop = unsafe {
            ascii_to_ascii(
                src_ptr.offset(read as isize),
                dst_ptr.offset(written as isize),
                ascii_budget,
            )
        };
        match stop {
            // No non-ASCII byte was reported: the whole budget was copied.
            None => return (read + ascii_budget, written + ascii_budget),
            Some((non_ascii, consumed)) => {
                read += consumed;
                written += consumed;
                // The non-ASCII byte needs two output bytes; stop without
                // consuming it if they don't fit.
                if written.checked_add(2).unwrap() > dst_len {
                    return (read, written);
                }
                read += 1; // consume `non_ascii`
                let code_point = non_ascii as u32;
                // Two-byte UTF-8 sequence: lead byte, then continuation byte.
                dst[written] = ((code_point >> 6) | 0xC0u32) as u8;
                dst[written + 1] = ((code_point & 0x3Fu32) | 0x80u32) as u8;
                written += 2;
            }
        }
    }
}
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
@ -1653,33 +1757,35 @@ pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
dst.len() >= src.len() * 2,
"Destination must not be shorter than the source times two."
);
let src_len = src.len();
let src_ptr = src.as_ptr();
let dst_ptr = dst.as_mut_ptr();
let mut total_read = 0usize;
let mut total_written = 0usize;
loop {
// src can't advance more than dst
let src_left = src_len - total_read;
if let Some((non_ascii, consumed)) = unsafe {
ascii_to_ascii(
src_ptr.offset(total_read as isize),
dst_ptr.offset(total_written as isize),
src_left,
)
} {
total_read += consumed + 1;
total_written += consumed;
let (read, written) = convert_latin1_to_utf8_partial(src, dst);
debug_assert_eq!(read, src.len());
written
}
let code_point = non_ascii as u32;
dst[total_written] = ((code_point >> 6) | 0xC0u32) as u8;
total_written += 1;
dst[total_written] = ((code_point as u32 & 0x3Fu32) | 0x80u32) as u8;
total_written += 1;
continue;
}
return total_written + src_left;
/// Converts Latin1 input, i.e. bytes whose unsigned value is interpreted as
/// a Unicode code point (U+0000 to U+00FF, inclusive), to UTF-8 into a
/// `&mut str` that may be too small for the whole result. The `str`
/// destination type signals the validity of the output through the Rust type
/// system.
///
/// Returns a tuple of the number of bytes read and the number of bytes
/// written.
///
/// If the output isn't large enough, not all input is consumed.
#[inline]
pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
    let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) };
    let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
    let len = bytes.len();
    // The byte-level conversion may leave garbage after `written`, so zero a
    // fixed-size window right behind the output...
    let window_end = ::std::cmp::min(len, written + MAX_STRIDE_SIZE);
    for byte in bytes.iter_mut().take(window_end).skip(written) {
        *byte = 0;
    }
    // ...and then keep zeroing for as long as the following bytes look like
    // UTF-8 continuation bytes (0b10xx_xxxx), so that no stray continuation
    // bytes trail the zeroed window.
    let mut trail = window_end;
    while trail < len && (bytes[trail] & 0xC0) == 0x80 {
        bytes[trail] = 0;
        trail += 1;
    }
    (read, written)
}
/// Converts bytes whose unsigned value is interpreted as Unicode code point
@ -1696,19 +1802,12 @@ pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
/// Panics if the destination buffer is shorter than stated above.
#[inline]
pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) };
let written = convert_latin1_to_utf8(src, bytes);
let len = bytes.len();
let mut trail = written;
let max = ::std::cmp::min(len, trail + MAX_STRIDE_SIZE);
while trail < max {
bytes[trail] = 0;
trail += 1;
}
while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
bytes[trail] = 0;
trail += 1;
}
assert!(
dst.len() >= src.len() * 2,
"Destination must not be shorter than the source times two."
);
let (read, written) = convert_latin1_to_str_partial(src, dst);
debug_assert_eq!(read, src.len());
written
}
@ -1718,6 +1817,7 @@ pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
@ -1731,12 +1831,16 @@ pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
///
/// If debug assertions are enabled (and not fuzzing) and the input is
/// not in the range U+0000 to U+00FF, inclusive.
#[inline]
pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
assert!(
dst.len() >= src.len(),
"Destination must not be shorter than the source."
);
non_fuzz_debug_assert!(is_utf8_latin1(src));
let src_len = src.len();
let src_ptr = src.as_ptr();
let dst_ptr = dst.as_mut_ptr();
@ -1776,11 +1880,12 @@ pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-Basic Latin input can't map to ASCII output.
/// If the input does not fulfill the condition stated above, does something
/// that is memory-safe without any promises about any properties of the
/// output and will probably assert in debug builds in future versions.
/// In particular, callers shouldn't assume the output to be the same across
/// crate versions or CPU architectures and should not assume that non-ASCII
/// input can't map to ASCII output.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
@ -1790,12 +1895,16 @@ pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
///
/// (Probably in future versions if debug assertions are enabled (and not
/// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
#[inline]
pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
assert!(
dst.len() >= src.len(),
"Destination must not be shorter than the source."
);
// non_fuzz_debug_assert!(is_utf16_latin1(src));
unsafe {
pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
}
@ -2100,6 +2209,18 @@ mod tests {
assert_eq!(dst, reference);
}
#[test]
fn test_convert_utf16_to_utf8_partial() {
    let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let src: Vec<u16> = reference.encode_utf16().collect();
    // Worst-case sized buffer; only its first 24 bytes are offered to the
    // partial conversion so that it has to stop early.
    let mut dst: Vec<u8> = vec![0u8; src.len() * 3 + 1];
    let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]);
    // Finish the remaining input with the non-partial conversion.
    let total = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]);
    dst.truncate(total);
    assert_eq!(dst, reference.as_bytes());
}
#[test]
fn test_convert_utf16_to_utf8() {
let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
@ -2127,6 +2248,14 @@ mod tests {
assert_eq!(dst, reference);
}
#[test]
fn test_convert_latin1_to_utf8_partial() {
    // Two bytes of output space: enough for the ASCII 'a', but not for 'a'
    // plus the two-byte UTF-8 form of 0xFF, so the conversion must stop
    // after the first input byte.
    let mut dst = [0u8; 2];
    let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
    assert_eq!(read, 1);
    assert_eq!(written, 1);
}
#[test]
fn test_convert_latin1_to_utf8() {
let mut src: Vec<u8> = Vec::with_capacity(256);
@ -2163,6 +2292,13 @@ mod tests {
assert_eq!(dst, reference);
}
// The panic comes from a debug assertion that is skipped when fuzzing, so
// this test is only compiled in configurations where the panic is promised.
#[cfg(all(debug_assertions, not(fuzzing)))]
#[test]
#[should_panic]
fn test_convert_utf8_to_latin1_lossy_panics() {
    let mut dst = [0u8; 16];
    // U+0100 is outside the U+0000...U+00FF range accepted by the function.
    let _ = convert_utf8_to_latin1_lossy("\u{100}".as_bytes(), &mut dst[..]);
}
#[test]
fn test_convert_utf16_to_latin1_lossy() {
let mut src: Vec<u16> = Vec::with_capacity(256);
@ -2179,6 +2315,13 @@ mod tests {
assert_eq!(dst, reference);
}
#[test]
// `should_panic` stays disabled for as long as the corresponding debug
// assertion in `convert_utf16_to_latin1_lossy` is commented out; for now
// this test only checks that out-of-range input is processed without
// panicking.
// #[should_panic]
fn test_convert_utf16_to_latin1_lossy_panics() {
    let mut out = [0u8; 16];
    // U+0100 is outside the U+0000...U+00FF range the function is meant for.
    let out_of_range = [0x0100u16];
    let _ = convert_utf16_to_latin1_lossy(&out_of_range, &mut out[..]);
}
#[test]
fn test_utf16_valid_up_to() {
let valid = vec![