зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1261841 part 1 - Vendor encoding_rs and encoding_c into m-c. rs=emk,SimonSapin.
MozReview-Commit-ID: Lphq69tSIXa
This commit is contained in:
Родитель
854d241f4f
Коммит
5326afdaea
|
@ -0,0 +1 @@
|
|||
{"files":{".cargo-ok":"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",".gitignore":"3effb8c580299a86c7c816e456406be2763f0e0954c66fed2ce5ce06e750f997","CONTRIBUTING.md":"8cd9262df951c4b42078aa55064ca3b8ef2676c06b8fc7c281c02ee3f1ae04a8","COPYRIGHT":"65fb11bb8d2aac1ea00620273e0595ff71f4a335d25b67acbccbaa1b9ad5a409","Cargo.toml":"e7532cd5f9aba02726720ec8707914e6f5a8ce24401415233def34ec778d31c8","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"90df74ddb43e7f5aa5068890eacc151ecca7c997c9515cf17aea30b8734075bf","README.md":"1d360a54ac30d2ed84cada251eeaae0ef43c6f2b771856d6c181b592e8f6a471","build-disabled.rs":"d65ed45d33ce834ab9f9f7c5f308e0a72605aa34ede6dca45a2077a2deee5cfa","include/encoding_rs.h":"dc015596eb8b4b0b2e79569a519e81c14301db8f5b96b4013989645a67a73422","include/encoding_rs_cpp.h":"f93c0e2b3e1ec4f1efb1fcee1f43e8d1424faf3e26d7084404c5ba5f2f6a2c4d","include/encoding_rs_statics.h":"800e6aa5aafe2fa3a3826ed0c0a0da34ca9495ff9c75c84845d44b14f5be1078","src/lib.rs":"69ac99046085286c00534b6d107df269cfdd67fc488190d690d2d3e8c01bf916"},"package":"45ef700aebe8c5fb44f081a54ab400f4f6b002a426bc5332381c108f49713432"}
|
|
@ -0,0 +1,6 @@
|
|||
target
|
||||
Cargo.lock
|
||||
.project
|
||||
.settings
|
||||
*~
|
||||
*.bk
|
|
@ -0,0 +1,38 @@
|
|||
If you send a pull request / patch, please observe the following.
|
||||
|
||||
## Licensing
|
||||
|
||||
Since this crate is dual-licensed,
|
||||
[section 5 of the Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0#contributions)
|
||||
is considered to apply in the sense of Contributions being automatically
|
||||
under the Apache License 2.0 or MIT dual license (see the `COPYRIGHT` file).
|
||||
That is, by the act of offering a Contribution, you place your Contribution
|
||||
under the Apache License 2.0 or MIT dual license stated in the `COPYRIGHT`
|
||||
file. Please do not contribute if you aren't willing or allowed to license your
|
||||
contributions in this manner.
|
||||
|
||||
You are encouraged to dedicate test code that you contribute to the Public
|
||||
Domain using the CC0 dedication. If you contribute test code that is not
|
||||
dedicated to the Public Domain, please be sure not to put it in a part of
|
||||
source code that the comments designate as being dedicated to the Public
|
||||
Domain.
|
||||
|
||||
## Copyright Notices
|
||||
|
||||
If you require the addition of your copyright notice, it's up to you to edit in
|
||||
your notice as part of your Contribution. Not adding a copyright notice is
|
||||
taken as a waiver of copyright notice.
|
||||
|
||||
## Compatibility with Stable Rust
|
||||
|
||||
Please ensure that your Contribution compiles with the latest stable-channel
|
||||
rustc.
|
||||
|
||||
## rustfmt
|
||||
|
||||
Please install [`rustfmt`](https://github.com/rust-lang-nursery/rustfmt) 0.4.1
|
||||
(the latest version has
|
||||
[a bug](https://github.com/rust-lang-nursery/rustfmt/issues/1149) that renders
|
||||
it unsuited for encoding_rs) and run `cargo fmt` before creating a pull
|
||||
request. (It's OK for `cargo fmt` to exit with an error due to too long lines.)
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
encoding_c is copyright 2015-2017 Mozilla Foundation.
|
||||
|
||||
Licensed under the Apache License, Version 2.0
|
||||
<LICENSE-APACHE or
|
||||
https://www.apache.org/licenses/LICENSE-2.0> or the MIT
|
||||
license <LICENSE-MIT or https://opensource.org/licenses/MIT>,
|
||||
at your option. All files in the project carrying such
|
||||
notice may not be copied, modified, or distributed except
|
||||
according to those terms.
|
|
@ -0,0 +1,25 @@
|
|||
[package]
|
||||
name = "encoding_c"
|
||||
description = "C API for encoding_rs"
|
||||
version = "0.7.4"
|
||||
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
|
||||
license = "MIT/Apache-2.0"
|
||||
readme = "README.md"
|
||||
documentation = "https://docs.rs/encoding_c/"
|
||||
homepage = "https://docs.rs/encoding_c/"
|
||||
repository = "https://github.com/hsivonen/encoding_c"
|
||||
keywords = ["ffi", "capi", "encoding", "unicode", "charset"]
|
||||
# Uncomment the line below and rename build-disabled.rs to build.rs to re-run cheddar.
|
||||
# build = "build.rs"
|
||||
|
||||
[features]
|
||||
simd-accel = ["encoding_rs/simd-accel"]
|
||||
no-static-ideograph-encoder-tables = ["encoding_rs/no-static-ideograph-encoder-tables"]
|
||||
parallel-utf8 = ["encoding_rs/parallel-utf8"]
|
||||
|
||||
[dependencies]
|
||||
encoding_rs = "0.6.11"
|
||||
|
||||
# Uncomment the lines below to re-run cheddar.
|
||||
# [build-dependencies]
|
||||
# rusty-cheddar = "0.3.3"
|
|
@ -0,0 +1,202 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
|
@ -0,0 +1,25 @@
|
|||
Copyright (c) 2015-2016 Mozilla Foundation
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
|
@ -0,0 +1,57 @@
|
|||
# encoding_c
|
||||
|
||||
[![crates.io](https://meritbadge.herokuapp.com/encoding_c)](https://crates.io/crates/encoding_c)
|
||||
[![docs.rs](https://docs.rs/encoding_c/badge.svg)](https://docs.rs/encoding_c/)
|
||||
[![Apache 2 / MIT dual-licensed](https://img.shields.io/badge/license-Apache%202%20%2F%20MIT-blue.svg)](https://github.com/hsivonen/encoding_c/blob/master/COPYRIGHT)
|
||||
|
||||
encoding_c is an FFI wrapper for [encoding_rs](https://github.com/hsivonen/encoding_rs).
|
||||
|
||||
## Licensing
|
||||
|
||||
Please see the file named
|
||||
[COPYRIGHT](https://github.com/hsivonen/encoding_c/blob/master/COPYRIGHT).
|
||||
|
||||
## C/C++ Headers
|
||||
|
||||
`include/encoding_rs.h` and `include/encoding_rs_statics.h` are needed for C
|
||||
usage.
|
||||
|
||||
`include/encoding_rs_cpp.h` is a sample C++ API built on top of the C API using
|
||||
GSL and the C++ standard library. Since C++ projects typically roll their own
|
||||
string classes, etc., it's probably necessary for C++ projects to manually
|
||||
adapt the header to their replacements of standard-library types.
|
||||
|
||||
## Release Notes
|
||||
|
||||
### 0.7.4
|
||||
|
||||
* Wrap `has_pending_state()`.
|
||||
|
||||
### 0.7.3
|
||||
|
||||
* Use C preprocessor definitions for encoding constant declarations.
|
||||
|
||||
### 0.7.2
|
||||
|
||||
* Parametrize the struct type names behind C preprocessor definitions.
|
||||
* Leave it to the user to provide `char16_t`. Avoid including a header for it.
|
||||
|
||||
### 0.7.1
|
||||
|
||||
* Fix documentation for pointers that get used in
|
||||
`std::slice::from_raw_parts()`.
|
||||
|
||||
### 0.7.0
|
||||
|
||||
* Map `None` to `SIZE_MAX` in the max length calculation functions.
|
||||
|
||||
### 0.6.0
|
||||
|
||||
* Check in the `cheddar`-generated header and comment out the `cheddar`-using
|
||||
`build.rs`.
|
||||
|
||||
### 0.5.0
|
||||
|
||||
* Initial release of encoding_c. (I.e. first release with FFI in a distinct
|
||||
crate.)
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
// Copyright 2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
extern crate cheddar;
|
||||
|
||||
use std::io::prelude::*;
|
||||
use std::fs::File;
|
||||
|
||||
/// Post-processes the cheddar-generated C header at `path` in place.
///
/// The file is read fully into memory, a fixed sequence of plain substring
/// replacements is applied, and the result is written back to the same path:
///
/// 1. The license banner and the "generated file" notice are inserted in
///    front of the include guard.
/// 2. `uint16_t` becomes `char16_t` and `uintptr_t` becomes `size_t`.
/// 3. The type names `Encoding`, `Encoder` and `Decoder` become the
///    `ENCODING_RS_*` preprocessor names.
/// 4. Occurrences of `*.html` that step 3 rewrote are restored
///    (presumably documentation link targets).
/// 5. `encoding_rs_statics.h` is included right after `<stdbool.h>`.
///
/// NOTE(review): these are untargeted substring replacements, so step 3 also
/// rewrites the words "Encoding"/"Encoder"/"Decoder" where they occur in
/// doc-comment prose, not only where they name a type.
///
/// # Errors
///
/// Returns the underlying `std::io::Error` if the file cannot be opened,
/// read (including when it is not valid UTF-8), re-created or written.
fn replace(path: &str) -> std::io::Result<()> {
    // Banner placed in front of the include guard. Written with explicit
    // `\n` escapes so the resulting text does not depend on how this source
    // file is indented.
    let guard = "#ifndef cheddar_generated_encoding_rs_h";
    let banner_and_guard =
        "// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT\n\
         // file at the top-level directory of this distribution.\n\
         //\n\
         // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or\n\
         // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license\n\
         // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your\n\
         // option. This file may not be copied, modified, or distributed\n\
         // except according to those terms.\n\
         \n\
         // THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.\n\
         // Instead, please regenerate using encoding_c/build.rs.\n\
         \n\
         #ifndef cheddar_generated_encoding_rs_h";

    // Read the whole generated header.
    let mut s = String::new();
    File::open(path)?.read_to_string(&mut s)?;

    s = s.replace(guard, banner_and_guard);

    // Map the integer types emitted by the generator to the ones the C API
    // is documented with.
    s = s.replace("uint16_t", "char16_t");
    s = s.replace("uintptr_t", "size_t");

    // Put the struct type names behind preprocessor definitions.
    s = s.replace("Encoding", "ENCODING_RS_ENCODING");
    s = s.replace("Encoder", "ENCODING_RS_ENCODER");
    s = s.replace("Decoder", "ENCODING_RS_DECODER");

    // The replacements above also hit `*.html` names; undo that so they keep
    // pointing at the original page names.
    s = s.replace("ENCODING_RS_ENCODING.html", "Encoding.html");
    s = s.replace("ENCODING_RS_ENCODER.html", "Encoder.html");
    s = s.replace("ENCODING_RS_DECODER.html", "Decoder.html");

    // Pull in the statics header right after <stdbool.h>.
    s = s.replace(
        "#include <stdbool.h>",
        "#include <stdbool.h>\n#include \"encoding_rs_statics.h\"",
    );

    // `File::create` truncates the existing file before the rewrite.
    File::create(path)?.write_all(s.as_bytes())?;
    Ok(())
}
|
||||
|
||||
fn main() {
|
||||
println!("cargo:rerun-if-changed=src/lib.rs");
|
||||
|
||||
let path = "include/encoding_rs.h";
|
||||
|
||||
cheddar::Cheddar::new()
|
||||
.expect("could not read manifest")
|
||||
.run_build(path);
|
||||
|
||||
match replace(path) {
|
||||
Ok(_) => {}
|
||||
Err(e) => println!("Performing replacements failed {}.", e),
|
||||
}
|
||||
}
|
|
@ -0,0 +1,639 @@
|
|||
|
||||
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
|
||||
// Instead, please regenerate using encoding_c/build.rs.
|
||||
|
||||
#ifndef cheddar_generated_encoding_rs_h
|
||||
#define cheddar_generated_encoding_rs_h
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include "encoding_rs_statics.h"
|
||||
|
||||
|
||||
|
||||
/// Implements the
|
||||
/// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
|
||||
/// algorithm.
|
||||
///
|
||||
/// If, after ASCII-lowercasing and removing leading and trailing
|
||||
/// whitespace, the argument matches a label defined in the Encoding
|
||||
/// Standard, `const ENCODING_RS_ENCODING*` representing the corresponding
|
||||
/// encoding is returned. If there is no match, `NULL` is returned.
|
||||
///
|
||||
/// This is the right function to use if the action upon the method returning
|
||||
/// `NULL` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`) instead.
|
||||
/// When the action upon the method returning `NULL` is not to proceed with
|
||||
/// a fallback but to refuse processing, `encoding_for_label_no_replacement()` is
|
||||
/// more appropriate.
|
||||
///
|
||||
/// The argument buffer can be in any ASCII-compatible encoding. It is not
|
||||
/// required to be UTF-8.
|
||||
///
|
||||
/// `label` must be non-`NULL` even if `label_len` is zero. When `label_len`
|
||||
/// is zero, it is OK for `label` to be something non-dereferencable,
|
||||
/// such as `0x1`. This is required due to Rust's optimization for slices
|
||||
/// within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if `label` and `label_len` don't designate a valid memory block
|
||||
/// or if `label` is `NULL`.
|
||||
ENCODING_RS_ENCODING const* encoding_for_label(uint8_t const* label, size_t label_len);
|
||||
|
||||
/// This function behaves the same as `encoding_for_label()`, except when
|
||||
/// `encoding_for_label()` would return `REPLACEMENT_ENCODING`, this method
|
||||
/// returns `NULL` instead.
|
||||
///
|
||||
/// This method is useful in scenarios where a fatal error is required
|
||||
/// upon invalid label, because in those cases the caller typically wishes
|
||||
/// to treat the labels that map to the replacement encoding as fatal
|
||||
/// errors, too.
|
||||
///
|
||||
/// It is not OK to use this function when the action upon the method returning
|
||||
/// `NULL` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
|
||||
/// such a case, the `encoding_for_label()` function should be used instead
|
||||
/// in order to avoid unsafe fallback for labels that `encoding_for_label()`
|
||||
/// maps to `REPLACEMENT_ENCODING`.
|
||||
///
|
||||
/// The argument buffer can be in any ASCII-compatible encoding. It is not
|
||||
/// required to be UTF-8.
|
||||
///
|
||||
/// `label` must be non-`NULL` even if `label_len` is zero. When `label_len`
|
||||
/// is zero, it is OK for `label` to be something non-dereferencable,
|
||||
/// such as `0x1`. This is required due to Rust's optimization for slices
|
||||
/// within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if `label` and `label_len` don't designate a valid memory block
|
||||
/// or if `label` is `NULL`.
|
||||
ENCODING_RS_ENCODING const* encoding_for_label_no_replacement(uint8_t const* label, size_t label_len);
|
||||
|
||||
/// Performs non-incremental BOM sniffing.
|
||||
///
|
||||
/// The argument must either be a buffer representing the entire input
|
||||
/// stream (non-streaming case) or a buffer representing at least the first
|
||||
/// three bytes of the input stream (streaming case).
|
||||
///
|
||||
/// Returns `UTF_8_ENCODING`, `UTF_16LE_ENCODING` or `UTF_16BE_ENCODING` if the
|
||||
/// argument starts with the UTF-8, UTF-16LE or UTF-16BE BOM or `NULL`
|
||||
/// otherwise. Upon return, `*buffer_len` is the length of the BOM (zero if
|
||||
/// there is no BOM).
|
||||
///
|
||||
/// `buffer` must be non-`NULL` even if `*buffer_len` is zero. When
|
||||
/// `*buffer_len` is zero, it is OK for `buffer` to be something
|
||||
/// non-dereferencable, such as `0x1`. This is required due to Rust's
|
||||
/// optimization for slices within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if `buffer` and `*buffer_len` don't designate a valid memory
|
||||
/// block or if `buffer` is `NULL`.
|
||||
ENCODING_RS_ENCODING const* encoding_for_bom(uint8_t const* buffer, size_t* buffer_len);
|
||||
|
||||
/// If the argument matches exactly (case-sensitively; no whitespace
|
||||
/// removal performed) the name of an encoding, returns
|
||||
/// `const ENCODING_RS_ENCODING*` representing that encoding. Otherwise panics.
|
||||
///
|
||||
/// The motivating use case for this function is interoperability with
|
||||
/// legacy Gecko code that represents encodings as name strings instead of
|
||||
/// type-safe `ENCODING_RS_ENCODING` objects. Using this function for other purposes is
|
||||
/// most likely the wrong thing to do.
|
||||
///
|
||||
/// `name` must be non-`NULL` even if `name_len` is zero. When `name_len`
|
||||
/// is zero, it is OK for `name` to be something non-dereferencable,
|
||||
/// such as `0x1`. This is required due to Rust's optimization for slices
|
||||
/// within `Option`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the argument is not the name of an encoding.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if `name` and `name_len` don't designate a valid memory block
|
||||
/// or if `name` is `NULL`.
|
||||
ENCODING_RS_ENCODING const* encoding_for_name(uint8_t const* name, size_t name_len);
|
||||
|
||||
/// Writes the name of the given `ENCODING_RS_ENCODING` to a caller-supplied buffer as
|
||||
/// ASCII and returns the number of bytes / ASCII characters written.
|
||||
///
|
||||
/// The output is not null-terminated.
|
||||
///
|
||||
/// The caller _MUST_ ensure that `name_out` points to a buffer whose length
|
||||
/// is at least `ENCODING_NAME_MAX_LENGTH` bytes.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if either argument is `NULL` or if `name_out` doesn't point to
|
||||
/// a valid block of memory whose length is at least
|
||||
/// `ENCODING_NAME_MAX_LENGTH` bytes.
|
||||
size_t encoding_name(ENCODING_RS_ENCODING const* encoding, uint8_t* name_out);
|
||||
|
||||
/// Checks whether the _output encoding_ of this encoding can encode every
|
||||
/// Unicode scalar. (Only true if the output encoding is UTF-8.)
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if the argument is `NULL`.
|
||||
bool encoding_can_encode_everything(ENCODING_RS_ENCODING const* encoding);
|
||||
|
||||
/// Checks whether the bytes 0x00...0x7F map exclusively to the characters
|
||||
/// U+0000...U+007F and vice versa.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if the argument is `NULL`.
|
||||
bool encoding_is_ascii_compatible(ENCODING_RS_ENCODING const* encoding);
|
||||
|
||||
/// Returns the _output encoding_ of this encoding. This is UTF-8 for
|
||||
/// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if the argument is `NULL`.
|
||||
ENCODING_RS_ENCODING const* encoding_output_encoding(ENCODING_RS_ENCODING const* encoding);
|
||||
|
||||
/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING` on the heap with BOM
|
||||
/// sniffing enabled and returns a pointer to the newly-allocated `ENCODING_RS_DECODER`.
|
||||
///
|
||||
/// BOM sniffing may cause the returned decoder to morph into a decoder
|
||||
/// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
|
||||
///
|
||||
/// Once the allocated `ENCODING_RS_DECODER` is no longer needed, the caller _MUST_
|
||||
/// deallocate it by passing the pointer returned by this function to
|
||||
/// `decoder_free()`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if the argument is `NULL`.
|
||||
ENCODING_RS_DECODER* encoding_new_decoder(ENCODING_RS_ENCODING const* encoding);
|
||||
|
||||
/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING` on the heap with BOM
|
||||
/// removal and returns a pointer to the newly-allocated `ENCODING_RS_DECODER`.
|
||||
///
|
||||
/// If the input starts with bytes that are the BOM for this encoding,
|
||||
/// those bytes are removed. However, the decoder never morphs into a
|
||||
/// decoder for another encoding: A BOM for another encoding is treated as
|
||||
/// (potentially malformed) input to the decoding algorithm for this
|
||||
/// encoding.
|
||||
///
|
||||
/// Once the allocated `ENCODING_RS_DECODER` is no longer needed, the caller _MUST_
|
||||
/// deallocate it by passing the pointer returned by this function to
|
||||
/// `decoder_free()`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if the argument is `NULL`.
|
||||
ENCODING_RS_DECODER* encoding_new_decoder_with_bom_removal(ENCODING_RS_ENCODING const* encoding);
|
||||
|
||||
/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING` on the heap with BOM
|
||||
/// handling disabled and returns a pointer to the newly-allocated `ENCODING_RS_DECODER`.
|
||||
///
|
||||
/// If the input starts with bytes that look like a BOM, those bytes are
|
||||
/// not treated as a BOM. (Hence, the decoder never morphs into a decoder
|
||||
/// for another encoding.)
|
||||
///
|
||||
/// _Note:_ If the caller has performed BOM sniffing on its own but has not
|
||||
/// removed the BOM, the caller should use
|
||||
/// `encoding_new_decoder_with_bom_removal()` instead of this function to cause
|
||||
/// the BOM to be removed.
|
||||
///
|
||||
/// Once the allocated `ENCODING_RS_DECODER` is no longer needed, the caller _MUST_
|
||||
/// deallocate it by passing the pointer returned by this function to
|
||||
/// `decoder_free()`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if the argument is `NULL`.
|
||||
ENCODING_RS_DECODER* encoding_new_decoder_without_bom_handling(ENCODING_RS_ENCODING const* encoding);
|
||||
|
||||
/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING` into memory provided by
|
||||
/// the caller with BOM sniffing enabled. (In practice, the target should
|
||||
/// likely be a pointer previously returned by `encoding_new_decoder()`.)
|
||||
///
|
||||
/// Note: If the caller has already performed BOM sniffing but has
|
||||
/// not removed the BOM, the caller should still use this function in
|
||||
/// order to cause the BOM to be ignored.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if either argument is `NULL`.
|
||||
void encoding_new_decoder_into(ENCODING_RS_ENCODING const* encoding, ENCODING_RS_DECODER* decoder);
|
||||
|
||||
/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING` into memory provided by
|
||||
/// the caller with BOM removal.
|
||||
///
|
||||
/// If the input starts with bytes that are the BOM for this encoding,
|
||||
/// those bytes are removed. However, the decoder never morphs into a
|
||||
/// decoder for another encoding: A BOM for another encoding is treated as
|
||||
/// (potentially malformed) input to the decoding algorithm for this
|
||||
/// encoding.
|
||||
///
|
||||
/// Once the `ENCODING_RS_DECODER` is no longer needed, the caller _MUST_
|
||||
/// release the memory it provided; if that memory was obtained from
|
||||
/// `encoding_new_decoder()`, pass that pointer to `decoder_free()`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if either argument is `NULL`.
|
||||
void encoding_new_decoder_with_bom_removal_into(ENCODING_RS_ENCODING const* encoding, ENCODING_RS_DECODER* decoder);
|
||||
|
||||
/// Allocates a new `ENCODING_RS_DECODER` for the given `ENCODING_RS_ENCODING` into memory provided by
|
||||
/// the caller with BOM handling disabled.
|
||||
///
|
||||
/// If the input starts with bytes that look like a BOM, those bytes are
|
||||
/// not treated as a BOM. (Hence, the decoder never morphs into a decoder
|
||||
/// for another encoding.)
|
||||
///
|
||||
/// _Note:_ If the caller has performed BOM sniffing on its own but has not
|
||||
/// removed the BOM, the caller should use
|
||||
/// `encoding_new_decoder_with_bom_removal_into()` instead of this function to
|
||||
/// cause the BOM to be removed.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if either argument is `NULL`.
|
||||
void encoding_new_decoder_without_bom_handling_into(ENCODING_RS_ENCODING const* encoding, ENCODING_RS_DECODER* decoder);
|
||||
|
||||
/// Allocates a new `ENCODING_RS_ENCODER` for the given `ENCODING_RS_ENCODING` on the heap and returns a
|
||||
/// pointer to the newly-allocated `ENCODING_RS_ENCODER`. (Exception, if the `ENCODING_RS_ENCODING` is
|
||||
/// `replacement`, a new `ENCODING_RS_ENCODER` for UTF-8 is instantiated (and that
|
||||
/// `ENCODING_RS_ENCODER` reports `UTF_8` as its `ENCODING_RS_ENCODING`).)
|
||||
///
|
||||
/// Once the allocated `ENCODING_RS_ENCODER` is no longer needed, the caller _MUST_
|
||||
/// deallocate it by passing the pointer returned by this function to
|
||||
/// `encoder_free()`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if the argument is `NULL`.
|
||||
ENCODING_RS_ENCODER* encoding_new_encoder(ENCODING_RS_ENCODING const* encoding);
|
||||
|
||||
/// Allocates a new `ENCODING_RS_ENCODER` for the given `ENCODING_RS_ENCODING` into memory provided by
|
||||
/// the caller. (In practice, the target should likely be a pointer previously
|
||||
/// returned by `encoding_new_encoder()`.)
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if either argument is `NULL`.
|
||||
void encoding_new_encoder_into(ENCODING_RS_ENCODING const* encoding, ENCODING_RS_ENCODER* encoder);
|
||||
|
||||
/// Validates UTF-8.
|
||||
///
|
||||
/// Returns the index of the first byte that makes the input malformed as
|
||||
/// UTF-8 or `buffer_len` if `buffer` is entirely valid.
|
||||
///
|
||||
/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When
|
||||
/// `buffer_len` is zero, it is OK for `buffer` to be something
|
||||
/// non-dereferencable, such as `0x1`. This is required due to Rust's
|
||||
/// optimization for slices within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory
|
||||
/// block or if `buffer` is `NULL`.
|
||||
size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
|
||||
|
||||
/// Validates ASCII.
|
||||
///
|
||||
/// Returns the index of the first byte that makes the input malformed as
|
||||
/// ASCII or `buffer_len` if `buffer` is entirely valid.
|
||||
///
|
||||
/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When
|
||||
/// `buffer_len` is zero, it is OK for `buffer` to be something
|
||||
/// non-dereferencable, such as `0x1`. This is required due to Rust's
|
||||
/// optimization for slices within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory
|
||||
/// block or if `buffer` is `NULL`.
|
||||
size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len);
|
||||
|
||||
/// Validates ISO-2022-JP ASCII-state data.
|
||||
///
|
||||
/// Returns the index of the first byte that makes the input not representable
|
||||
/// in the ASCII state of ISO-2022-JP or `buffer_len` if `buffer` is entirely
|
||||
/// representable in the ASCII state of ISO-2022-JP.
|
||||
///
|
||||
/// `buffer` must be non-`NULL` even if `buffer_len` is zero. When
|
||||
/// `buffer_len` is zero, it is OK for `buffer` to be something
|
||||
/// non-dereferencable, such as `0x1`. This is required due to Rust's
|
||||
/// optimization for slices within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if `buffer` and `buffer_len` don't designate a valid memory
|
||||
/// block or if `buffer` is `NULL`.
|
||||
size_t encoding_iso_2022_jp_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len);
|
||||
|
||||
/// Deallocates an `ENCODING_RS_DECODER` previously allocated by `encoding_new_decoder()`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if the argument is `NULL`.
|
||||
void decoder_free(ENCODING_RS_DECODER* decoder);
|
||||
|
||||
/// The `ENCODING_RS_ENCODING` this `ENCODING_RS_DECODER` is for.
|
||||
///
|
||||
/// BOM sniffing can change the return value of this method during the life
|
||||
/// of the decoder.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if the argument is `NULL`.
|
||||
ENCODING_RS_ENCODING const* decoder_encoding(ENCODING_RS_DECODER const* decoder);
|
||||
|
||||
/// Query the worst-case UTF-8 output size _with replacement_.
|
||||
///
|
||||
/// Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
|
||||
/// that will not overflow given the current state of the decoder and
|
||||
/// `byte_length` number of additional input bytes when decoding with
|
||||
/// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
|
||||
/// sequence or `SIZE_MAX` if `size_t` would overflow.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if `decoder` is `NULL`.
|
||||
size_t decoder_max_utf8_buffer_length(ENCODING_RS_DECODER const* decoder, size_t byte_length);
|
||||
|
||||
/// Query the worst-case UTF-8 output size _without replacement_.
|
||||
///
|
||||
/// Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
|
||||
/// that will not overflow given the current state of the decoder and
|
||||
/// `byte_length` number of additional input bytes when decoding without
|
||||
/// replacement error handling or `SIZE_MAX` if `size_t` would overflow.
|
||||
///
|
||||
/// Note that this value may be too small for the `_with_replacement` case.
|
||||
/// Use `decoder_max_utf8_buffer_length()` for that case.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if `decoder` is `NULL`.
|
||||
size_t decoder_max_utf8_buffer_length_without_replacement(ENCODING_RS_DECODER const* decoder, size_t byte_length);
|
||||
|
||||
/// Incrementally decode a byte stream into UTF-8 with malformed sequences
|
||||
/// replaced with the REPLACEMENT CHARACTER.
|
||||
///
|
||||
/// See the top-level FFI documentation for documentation for how the
|
||||
/// `decoder_decode_*` functions are mapped from Rust and the documentation
|
||||
/// for the [`ENCODING_RS_DECODER`][1] struct for the semantics.
|
||||
///
|
||||
/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
|
||||
/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
|
||||
/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
|
||||
/// optimization for slices within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
|
||||
/// don't designate a valid block of memory or `dst` and `dst_len` don't
|
||||
/// designate a valid block of memory.
|
||||
///
|
||||
/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html
|
||||
uint32_t decoder_decode_to_utf8(ENCODING_RS_DECODER* decoder, uint8_t const* src, size_t* src_len, uint8_t* dst, size_t* dst_len, bool last, bool* had_replacements);
|
||||
|
||||
/// Incrementally decode a byte stream into UTF-8 _without replacement_.
|
||||
///
|
||||
/// See the top-level FFI documentation for documentation for how the
|
||||
/// `decoder_decode_*` functions are mapped from Rust and the documentation
|
||||
/// for the [`ENCODING_RS_DECODER`][1] struct for the semantics.
|
||||
///
|
||||
/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
|
||||
/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
|
||||
/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
|
||||
/// optimization for slices within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
|
||||
/// don't designate a valid block of memory or `dst` and `dst_len` don't
|
||||
/// designate a valid block of memory.
|
||||
///
|
||||
/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html
|
||||
uint32_t decoder_decode_to_utf8_without_replacement(ENCODING_RS_DECODER* decoder, uint8_t const* src, size_t* src_len, uint8_t* dst, size_t* dst_len, bool last);
|
||||
|
||||
/// Query the worst-case UTF-16 output size (with or without replacement).
|
||||
///
|
||||
/// Returns the size of the output buffer in UTF-16 code units (`char16_t`)
|
||||
/// that will not overflow given the current state of the decoder and
|
||||
/// `byte_length` number of additional input bytes or `SIZE_MAX` if `size_t`
|
||||
/// would overflow.
|
||||
///
|
||||
/// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
|
||||
/// return value of this method applies also in the
|
||||
/// `_without_replacement` case.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if `decoder` is `NULL`.
|
||||
size_t decoder_max_utf16_buffer_length(ENCODING_RS_DECODER const* decoder, size_t byte_length);
|
||||
|
||||
/// Incrementally decode a byte stream into UTF-16 with malformed sequences
|
||||
/// replaced with the REPLACEMENT CHARACTER.
|
||||
///
|
||||
/// See the top-level FFI documentation for documentation for how the
|
||||
/// `decoder_decode_*` functions are mapped from Rust and the documentation
|
||||
/// for the [`ENCODING_RS_DECODER`][1] struct for the semantics.
|
||||
///
|
||||
/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
|
||||
/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
|
||||
/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
|
||||
/// optimization for slices within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
|
||||
/// don't designate a valid block of memory or `dst` and `dst_len` don't
|
||||
/// designate a valid block of memory.
|
||||
///
|
||||
/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html
|
||||
uint32_t decoder_decode_to_utf16(ENCODING_RS_DECODER* decoder, uint8_t const* src, size_t* src_len, char16_t* dst, size_t* dst_len, bool last, bool* had_replacements);
|
||||
|
||||
/// Incrementally decode a byte stream into UTF-16 _without replacement_.
|
||||
///
|
||||
/// See the top-level FFI documentation for documentation for how the
|
||||
/// `decoder_decode_*` functions are mapped from Rust and the documentation
|
||||
/// for the [`ENCODING_RS_DECODER`][1] struct for the semantics.
|
||||
///
|
||||
/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
|
||||
/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
|
||||
/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
|
||||
/// optimization for slices within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
|
||||
/// don't designate a valid block of memory or `dst` and `dst_len` don't
|
||||
/// designate a valid block of memory.
|
||||
///
|
||||
/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Decoder.html
|
||||
uint32_t decoder_decode_to_utf16_without_replacement(ENCODING_RS_DECODER* decoder, uint8_t const* src, size_t* src_len, char16_t* dst, size_t* dst_len, bool last);
|
||||
|
||||
/// Deallocates an `ENCODING_RS_ENCODER` previously allocated by `encoding_new_encoder()`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if the argument is `NULL`.
|
||||
void encoder_free(ENCODING_RS_ENCODER* encoder);
|
||||
|
||||
/// The `ENCODING_RS_ENCODING` this `ENCODING_RS_ENCODER` is for.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if the argument is `NULL`.
|
||||
ENCODING_RS_ENCODING const* encoder_encoding(ENCODING_RS_ENCODER const* encoder);
|
||||
|
||||
/// Returns `true` if this is an ISO-2022-JP encoder that's not in the
|
||||
/// ASCII state and `false` otherwise.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if the argument is `NULL`.
|
||||
bool encoder_has_pending_state(ENCODING_RS_ENCODER const* encoder);
|
||||
|
||||
/// Query the worst-case output size when encoding from UTF-8 with
|
||||
/// replacement.
|
||||
///
|
||||
/// Returns the size of the output buffer in bytes that will not overflow
|
||||
/// given the current state of the encoder and `byte_length` number of
|
||||
/// additional input code units if there are no unmappable characters in
|
||||
/// the input or `SIZE_MAX` if `size_t` would overflow.
|
||||
size_t encoder_max_buffer_length_from_utf8_if_no_unmappables(ENCODING_RS_ENCODER const* encoder, size_t byte_length);
|
||||
|
||||
/// Query the worst-case output size when encoding from UTF-8 without
|
||||
/// replacement.
|
||||
///
|
||||
/// Returns the size of the output buffer in bytes that will not overflow
|
||||
/// given the current state of the encoder and `byte_length` number of
|
||||
/// additional input code units or `SIZE_MAX` if `size_t` would overflow.
|
||||
size_t encoder_max_buffer_length_from_utf8_without_replacement(ENCODING_RS_ENCODER const* encoder, size_t byte_length);
|
||||
|
||||
/// Incrementally encode into byte stream from UTF-8 with unmappable
|
||||
/// characters replaced with HTML (decimal) numeric character references.
|
||||
///
|
||||
/// The input absolutely _MUST_ be valid UTF-8 or the behavior is memory-unsafe!
|
||||
/// If in doubt, check the validity of input before using!
|
||||
///
|
||||
/// See the top-level FFI documentation for documentation for how the
|
||||
/// `encoder_encode_*` functions are mapped from Rust and the documentation
|
||||
/// for the [`ENCODING_RS_ENCODER`][1] struct for the semantics.
|
||||
///
|
||||
/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
|
||||
/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
|
||||
/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
|
||||
/// optimization for slices within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
|
||||
/// don't designate a valid block of memory or `dst` and `dst_len` don't
|
||||
/// designate a valid block of memory.
|
||||
///
|
||||
/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html
|
||||
uint32_t encoder_encode_from_utf8(ENCODING_RS_ENCODER* encoder, uint8_t const* src, size_t* src_len, uint8_t* dst, size_t* dst_len, bool last, bool* had_replacements);
|
||||
|
||||
/// Incrementally encode into byte stream from UTF-8 _without replacement_.
|
||||
///
|
||||
/// See the top-level FFI documentation for documentation for how the
|
||||
/// `encoder_encode_*` functions are mapped from Rust and the documentation
|
||||
/// for the [`ENCODING_RS_ENCODER`][1] struct for the semantics.
|
||||
///
|
||||
/// The input absolutely _MUST_ be valid UTF-8 or the behavior is memory-unsafe!
|
||||
/// If in doubt, check the validity of input before using!
|
||||
///
|
||||
/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
|
||||
/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
|
||||
/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
|
||||
/// optimization for slices within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
|
||||
/// don't designate a valid block of memory or `dst` and `dst_len` don't
|
||||
/// designate a valid block of memory.
|
||||
///
|
||||
/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html
|
||||
uint32_t encoder_encode_from_utf8_without_replacement(ENCODING_RS_ENCODER* encoder, uint8_t const* src, size_t* src_len, uint8_t* dst, size_t* dst_len, bool last);
|
||||
|
||||
/// Query the worst-case output size when encoding from UTF-16 with
|
||||
/// replacement.
|
||||
///
|
||||
/// Returns the size of the output buffer in bytes that will not overflow
|
||||
/// given the current state of the encoder and `u16_length` number of
|
||||
/// additional input code units if there are no unmappable characters in
|
||||
/// the input or `SIZE_MAX` if `size_t` would overflow.
|
||||
size_t encoder_max_buffer_length_from_utf16_if_no_unmappables(ENCODING_RS_ENCODER const* encoder, size_t u16_length);
|
||||
|
||||
/// Query the worst-case output size when encoding from UTF-16 without
|
||||
/// replacement.
|
||||
///
|
||||
/// Returns the size of the output buffer in bytes that will not overflow
|
||||
/// given the current state of the encoder and `u16_length` number of
|
||||
/// additional input code units or `SIZE_MAX` if `size_t` would overflow.
|
||||
size_t encoder_max_buffer_length_from_utf16_without_replacement(ENCODING_RS_ENCODER const* encoder, size_t u16_length);
|
||||
|
||||
/// Incrementally encode into byte stream from UTF-16 with unmappable
|
||||
/// characters replaced with HTML (decimal) numeric character references.
|
||||
///
|
||||
/// See the top-level FFI documentation for documentation for how the
|
||||
/// `encoder_encode_*` functions are mapped from Rust and the documentation
|
||||
/// for the [`ENCODING_RS_ENCODER`][1] struct for the semantics.
|
||||
///
|
||||
/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
|
||||
/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
|
||||
/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
|
||||
/// optimization for slices within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
|
||||
/// don't designate a valid block of memory or `dst` and `dst_len` don't
|
||||
/// designate a valid block of memory.
|
||||
///
|
||||
/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html
|
||||
uint32_t encoder_encode_from_utf16(ENCODING_RS_ENCODER* encoder, char16_t const* src, size_t* src_len, uint8_t* dst, size_t* dst_len, bool last, bool* had_replacements);
|
||||
|
||||
/// Incrementally encode into byte stream from UTF-16 _without replacement_.
|
||||
///
|
||||
/// See the top-level FFI documentation for documentation for how the
|
||||
/// `encoder_encode_*` functions are mapped from Rust and the documentation
|
||||
/// for the [`ENCODING_RS_ENCODER`][1] struct for the semantics.
|
||||
///
|
||||
/// `src` must be non-`NULL` even if `src_len` is zero. When `src_len` is zero,
|
||||
/// it is OK for `src` to be something non-dereferencable, such as `0x1`.
|
||||
/// Likewise for `dst` when `dst_len` is zero. This is required due to Rust's
|
||||
/// optimization for slices within `Option`.
|
||||
///
|
||||
/// # Undefined behavior
|
||||
///
|
||||
/// UB ensues if any of the pointer arguments is `NULL`, `src` and `src_len`
|
||||
/// don't designate a valid block of memory or `dst` and `dst_len` don't
|
||||
/// designate a valid block of memory.
|
||||
///
|
||||
/// [1]: https://docs.rs/encoding_rs/0.6.10/encoding_rs/struct.Encoder.html
|
||||
uint32_t encoder_encode_from_utf16_without_replacement(ENCODING_RS_ENCODER* encoder, char16_t const* src, size_t* src_len, uint8_t* dst, size_t* dst_len, bool last);
|
||||
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,167 @@
|
|||
// Copyright 2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
|
||||
// Instead, please regenerate using generate-encoding-data.py
|
||||
|
||||
// This file is not meant to be included directly. Instead, encoding_rs.h
|
||||
// includes this file.
|
||||
|
||||
#ifndef encoding_rs_statics_h_
|
||||
#define encoding_rs_statics_h_
|
||||
|
||||
#ifndef ENCODING_RS_ENCODING
|
||||
#define ENCODING_RS_ENCODING Encoding
|
||||
#ifndef __cplusplus
|
||||
typedef struct Encoding_ Encoding;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef ENCODING_RS_ENCODER
|
||||
#define ENCODING_RS_ENCODER Encoder
|
||||
#ifndef __cplusplus
|
||||
typedef struct Encoder_ Encoder;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef ENCODING_RS_DECODER
|
||||
#define ENCODING_RS_DECODER Decoder
|
||||
#ifndef __cplusplus
|
||||
typedef struct Decoder_ Decoder;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define INPUT_EMPTY 0
|
||||
|
||||
#define OUTPUT_FULL 0xFFFFFFFF
|
||||
|
||||
// x-mac-cyrillic
|
||||
#define ENCODING_NAME_MAX_LENGTH 14
|
||||
|
||||
/// The Big5 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const BIG5_ENCODING;
|
||||
|
||||
/// The EUC-JP encoding.
|
||||
extern const ENCODING_RS_ENCODING* const EUC_JP_ENCODING;
|
||||
|
||||
/// The EUC-KR encoding.
|
||||
extern const ENCODING_RS_ENCODING* const EUC_KR_ENCODING;
|
||||
|
||||
/// The GBK encoding.
|
||||
extern const ENCODING_RS_ENCODING* const GBK_ENCODING;
|
||||
|
||||
/// The IBM866 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const IBM866_ENCODING;
|
||||
|
||||
/// The ISO-2022-JP encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_2022_JP_ENCODING;
|
||||
|
||||
/// The ISO-8859-10 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_10_ENCODING;
|
||||
|
||||
/// The ISO-8859-13 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_13_ENCODING;
|
||||
|
||||
/// The ISO-8859-14 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_14_ENCODING;
|
||||
|
||||
/// The ISO-8859-15 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_15_ENCODING;
|
||||
|
||||
/// The ISO-8859-16 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_16_ENCODING;
|
||||
|
||||
/// The ISO-8859-2 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_2_ENCODING;
|
||||
|
||||
/// The ISO-8859-3 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_3_ENCODING;
|
||||
|
||||
/// The ISO-8859-4 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_4_ENCODING;
|
||||
|
||||
/// The ISO-8859-5 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_5_ENCODING;
|
||||
|
||||
/// The ISO-8859-6 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_6_ENCODING;
|
||||
|
||||
/// The ISO-8859-7 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_7_ENCODING;
|
||||
|
||||
/// The ISO-8859-8 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_8_ENCODING;
|
||||
|
||||
/// The ISO-8859-8-I encoding.
|
||||
extern const ENCODING_RS_ENCODING* const ISO_8859_8_I_ENCODING;
|
||||
|
||||
/// The KOI8-R encoding.
|
||||
extern const ENCODING_RS_ENCODING* const KOI8_R_ENCODING;
|
||||
|
||||
/// The KOI8-U encoding.
|
||||
extern const ENCODING_RS_ENCODING* const KOI8_U_ENCODING;
|
||||
|
||||
/// The Shift_JIS encoding.
|
||||
extern const ENCODING_RS_ENCODING* const SHIFT_JIS_ENCODING;
|
||||
|
||||
/// The UTF-16BE encoding.
|
||||
extern const ENCODING_RS_ENCODING* const UTF_16BE_ENCODING;
|
||||
|
||||
/// The UTF-16LE encoding.
|
||||
extern const ENCODING_RS_ENCODING* const UTF_16LE_ENCODING;
|
||||
|
||||
/// The UTF-8 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const UTF_8_ENCODING;
|
||||
|
||||
/// The gb18030 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const GB18030_ENCODING;
|
||||
|
||||
/// The macintosh encoding.
|
||||
extern const ENCODING_RS_ENCODING* const MACINTOSH_ENCODING;
|
||||
|
||||
/// The replacement encoding.
|
||||
extern const ENCODING_RS_ENCODING* const REPLACEMENT_ENCODING;
|
||||
|
||||
/// The windows-1250 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const WINDOWS_1250_ENCODING;
|
||||
|
||||
/// The windows-1251 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const WINDOWS_1251_ENCODING;
|
||||
|
||||
/// The windows-1252 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const WINDOWS_1252_ENCODING;
|
||||
|
||||
/// The windows-1253 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const WINDOWS_1253_ENCODING;
|
||||
|
||||
/// The windows-1254 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const WINDOWS_1254_ENCODING;
|
||||
|
||||
/// The windows-1255 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const WINDOWS_1255_ENCODING;
|
||||
|
||||
/// The windows-1256 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const WINDOWS_1256_ENCODING;
|
||||
|
||||
/// The windows-1257 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const WINDOWS_1257_ENCODING;
|
||||
|
||||
/// The windows-1258 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const WINDOWS_1258_ENCODING;
|
||||
|
||||
/// The windows-874 encoding.
|
||||
extern const ENCODING_RS_ENCODING* const WINDOWS_874_ENCODING;
|
||||
|
||||
/// The x-mac-cyrillic encoding.
|
||||
extern const ENCODING_RS_ENCODING* const X_MAC_CYRILLIC_ENCODING;
|
||||
|
||||
/// The x-user-defined encoding.
|
||||
extern const ENCODING_RS_ENCODING* const X_USER_DEFINED_ENCODING;
|
||||
|
||||
#endif // encoding_rs_statics_h_
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -0,0 +1,10 @@
|
|||
target
|
||||
Cargo.lock
|
||||
.project
|
||||
.settings
|
||||
*~
|
||||
*.bk
|
||||
fuzz/target
|
||||
fuzz/Cargo.lock
|
||||
fuzz/artifacts
|
||||
fuzz/corpus
|
|
@ -0,0 +1,8 @@
|
|||
language: rust
|
||||
rust:
|
||||
- stable
|
||||
- beta
|
||||
- nightly
|
||||
matrix:
|
||||
allow_failures:
|
||||
- rust: nightly
|
|
@ -0,0 +1,45 @@
|
|||
If you send a pull request / patch, please observe the following.
|
||||
|
||||
## Licensing
|
||||
|
||||
Since this crate is dual-licensed,
|
||||
[section 5 of the Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0#contributions)
|
||||
is considered to apply in the sense of Contributions being automatically
|
||||
under the Apache License 2.0 or MIT dual license (see the `COPYRIGHT` file).
|
||||
That is, by the act of offering a Contribution, you place your Contribution
|
||||
under the Apache License 2.0 or MIT dual license stated in the `COPYRIGHT`
|
||||
file. Please do not contribute if you aren't willing or allowed to license your
|
||||
contributions in this manner.
|
||||
|
||||
You are encouraged to dedicate test code that you contribute to the Public
|
||||
Domain using the CC0 dedication. If you contribute test code that is not
|
||||
dedicated to the Public Domain, please be sure not to put it in a part of
|
||||
source code that the comments designate as being dedicated to the Public
|
||||
Domain.
|
||||
|
||||
## Copyright Notices
|
||||
|
||||
If you require the addition of your copyright notice, it's up to you to edit in
|
||||
your notice as part of your Contribution. Not adding a copyright notice is
|
||||
taken as a waiver of copyright notice.
|
||||
|
||||
## No Encodings Beyond The Encoding Standard
|
||||
|
||||
Please do not contribute implementations of encodings that are not specified
|
||||
in the [Encoding Standard](https://encoding.spec.whatwg.org/).
|
||||
|
||||
For example, an implementation of UTF-7 would be explicitly not welcome.
|
||||
|
||||
## Compatibility with Stable Rust
|
||||
|
||||
Please ensure that your Contribution compiles with the latest stable-channel
|
||||
rustc.
|
||||
|
||||
## rustfmt
|
||||
|
||||
Please install [`rustfmt`](https://github.com/rust-lang-nursery/rustfmt) and
|
||||
run `cargo fmt` before creating a pull request.
|
||||
|
||||
## Unit tests
|
||||
|
||||
Please ensure that `cargo test` succeeds.
|
|
@ -0,0 +1,26 @@
|
|||
encoding_rs is copyright 2013-2016 Mozilla Foundation.
|
||||
|
||||
Licensed under the Apache License, Version 2.0
|
||||
<LICENSE-APACHE or
|
||||
https://www.apache.org/licenses/LICENSE-2.0> or the MIT
|
||||
license <LICENSE-MIT or https://opensource.org/licenses/MIT>,
|
||||
at your option. All files in the project carrying such
|
||||
notice may not be copied, modified, or distributed except
|
||||
according to those terms.
|
||||
|
||||
Test code within encoding_rs is dedicated to the Public Domain when so
|
||||
designated (see the individual files for PD/CC0-dedicated sections).
|
||||
|
||||
The file utf_8_core.rs was extracted from the Rust project at revision
|
||||
7ad7232422f7e5bbfa0e52dabe36c12677df19e2, whose COPYRIGHT file said (in part):
|
||||
|
||||
The Rust Project is copyright 2010, The Rust Project
|
||||
Developers.
|
||||
|
||||
Licensed under the Apache License, Version 2.0
|
||||
<LICENSE-APACHE or
|
||||
http://www.apache.org/licenses/LICENSE-2.0> or the MIT
|
||||
license <LICENSE-MIT or http://opensource.org/licenses/MIT>,
|
||||
at your option. All files in the project carrying such
|
||||
notice may not be copied, modified, or distributed except
|
||||
according to those terms.
|
|
@ -0,0 +1,28 @@
|
|||
[package]
|
||||
name = "encoding_rs"
|
||||
description = "A Gecko-oriented implementation of the Encoding Standard"
|
||||
version = "0.6.11" # Remember to keep html_root_url in lib.rs in sync!
|
||||
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
|
||||
license = "MIT/Apache-2.0"
|
||||
readme = "README.md"
|
||||
documentation = "https://docs.rs/encoding_rs/"
|
||||
homepage = "https://docs.rs/encoding_rs/"
|
||||
repository = "https://github.com/hsivonen/encoding_rs"
|
||||
keywords = ["encoding", "web", "unicode", "charset"]
|
||||
categories = ["text-processing", "encoding", "web-programming", "email"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "hsivonen/encoding_rs" }
|
||||
|
||||
[features]
|
||||
simd-accel = ["simd"]
|
||||
no-static-ideograph-encoder-tables = []
|
||||
parallel-utf8 = ["rayon"]
|
||||
|
||||
[dependencies]
|
||||
cfg-if = "0.1.0"
|
||||
simd = { version = "0.2.0", optional = true }
|
||||
rayon = { version = "0.7.0", optional = true }
|
||||
|
||||
[profile.release]
|
||||
lto = true
|
|
@ -0,0 +1,77 @@
|
|||
This document contains notes about various ideas that for one reason or another
|
||||
are not being actively pursued.
|
||||
|
||||
## Next byte is non-ASCII after ASCII optimization
|
||||
|
||||
The current plan for a SIMD-accelerated inner loop for handling ASCII bytes
|
||||
makes no use of the bit of information that if the buffers didn't end but the
|
||||
ASCII loop exited, the next byte will not be an ASCII byte.
|
||||
|
||||
## The structure of handles.rs and bound checks
|
||||
|
||||
handles.rs is designed to make it possible to avoid bound checks when writing
|
||||
to the slices. While it would be possible to omit the bound checks manually,
|
||||
it probably makes more sense to carry out an investigation to make sure that
|
||||
the compiler performs the omission. If not, it makes more sense to file a bug
|
||||
on the compiler than to omit the checks manually.
|
||||
|
||||
## Handling ASCII with table lookups when decoding single-byte to UTF-16
|
||||
|
||||
Both uconv and ICU outperform encoding_rs when decoding single-byte to UTF-16.
|
||||
uconv doesn't even do anything fancy to manually unroll the loop (see below).
|
||||
Both handle even the ASCII range using table lookup. That is, there's no branch
|
||||
for checking if we're in the lower or upper half of the encoding.
|
||||
|
||||
However, adding SIMD acceleration for the ASCII half will likely be a bigger
|
||||
win than eliminating the branch to decide ASCII vs. non-ASCII.
|
||||
|
||||
## Manual loop unrolling for single-byte encodings
|
||||
|
||||
ICU currently outperforms encoding_rs (by over x2!) when decoding a single-byte
|
||||
encoding to UTF-16. This appears to be thanks to manually unrolling the
|
||||
conversion loop by 16. See [ucnv_MBCSSingleToBMPWithOffsets][1].
|
||||
|
||||
[1]: https://ssl.icu-project.org/repos/icu/icu/tags/release-55-1/source/common/ucnvmbcs.cpp
|
||||
|
||||
Notably, none of the single-byte encodings have bytes that'd decode to the
|
||||
upper half of BMP. Therefore, if the unmappable marker has the highest bit set
|
||||
instead of being zero, the check for unmappables within a 16-character stride
|
||||
can be done either by ORing the BMP characters in the stride together and
|
||||
checking the high bit or by loading the upper halves of the BMP characters
|
||||
in a `u8x8` register and checking the high bits using the `_mm_movemask_epi8`
|
||||
/ `pmovmskb` SSE2 instruction.
|
||||
|
||||
## After non-ASCII, handle ASCII punctuation without SIMD
|
||||
|
||||
Since the failure mode of SIMD ASCII acceleration involves wasted alignment
|
||||
checks and a wasted SIMD read when the next code unit is non-ASCII and non-Latin
|
||||
scripts have runs of non-ASCII even if ASCII spaces and punctuation are used,
|
||||
consider handling the next two or three bytes following non-ASCII as non-SIMD
|
||||
before looping back to the SIMD mode. Maybe move back to SIMD ASCII faster if
|
||||
there's ASCII that's not space or punctuation. Maybe with the "space or
|
||||
punctuation" check in place, this code can be allowed to be in place even for
|
||||
UTF-8 and Latin single-byte (i.e. not having different code for Latin and
|
||||
non-Latin single-byte).
|
||||
|
||||
## Prefer maintaining alignment
|
||||
|
||||
Instead of returning to acceleration directly after non-ASCII, consider
|
||||
continuing to the alignment boundary without acceleration.
|
||||
|
||||
## Read from SIMD lanes instead of RAM (cache) when ASCII check fails
|
||||
|
||||
When the SIMD ASCII check fails, the data has already been read from memory.
|
||||
Test whether it's faster to read the data by lane from the SIMD register than
|
||||
to read it again from RAM (cache).
|
||||
|
||||
## Use Level 2 Hanzi and Level 2 Kanji ordering
|
||||
|
||||
These two are ordered by radical and then by stroke count, so in principle,
|
||||
they should be mostly Unicode-ordered, although at least Level 2 Hanzi isn't
|
||||
fully Unicode-ordered. Is "mostly" good enough for encode acceleration?
|
||||
|
||||
## Create a `divmod_94()` function
|
||||
|
||||
Experiment with a function that computes `(i / 94, i % 94)` more efficiently
|
||||
than generic code.
|
||||
|
|
@ -0,0 +1,202 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
|
@ -0,0 +1,54 @@
|
|||
Copyright (c) 2013-2016 Mozilla Foundation
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
The file utf_8_core.rs was extracted from the Rust project at revision
|
||||
7ad7232422f7e5bbfa0e52dabe36c12677df19e2, whose LICENSE-MIT file said:
|
||||
|
||||
Copyright (c) 2010 The Rust Project Developers
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
|
@ -0,0 +1,256 @@
|
|||
# encoding_rs
|
||||
|
||||
[![Build Status](https://travis-ci.org/hsivonen/encoding_rs.svg?branch=master)](https://travis-ci.org/hsivonen/encoding_rs)
|
||||
[![crates.io](https://meritbadge.herokuapp.com/encoding_rs)](https://crates.io/crates/encoding_rs)
|
||||
[![docs.rs](https://docs.rs/encoding_rs/badge.svg)](https://docs.rs/encoding_rs/)
|
||||
[![Apache 2 / MIT dual-licensed](https://img.shields.io/badge/license-Apache%202%20%2F%20MIT-blue.svg)](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
|
||||
|
||||
encoding_rs aspires to become an implementation of the
|
||||
[Encoding Standard](https://encoding.spec.whatwg.org/) that
|
||||
|
||||
1. Is written in Rust.
|
||||
2. Is suitable for use in Gecko as a replacement of uconv. (I.e. supports
|
||||
decoding to UTF-16 and encoding from UTF-16.)
|
||||
3. Is suitable for use in Rust code (both in Gecko and independently of Gecko).
|
||||
(I.e. supports decoding to UTF-8 and encoding from UTF-8 and provides an API
|
||||
compatible with at least the most common ways of using
|
||||
[rust-encoding](https://github.com/lifthrasiir/rust-encoding/).)
|
||||
|
||||
## Licensing
|
||||
|
||||
Please see the file named
|
||||
[COPYRIGHT](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT).
|
||||
|
||||
## API Documentation
|
||||
|
||||
Generated [API documentation](https://docs.rs/encoding_rs/) is available
|
||||
online.
|
||||
|
||||
## Design
|
||||
|
||||
For design considerations, please see the associated [technical proposal to
|
||||
rewrite uconv in Rust](https://docs.google.com/document/d/13GCbdvKi83a77ZcKOxaEteXp1SOGZ_9Fmztb9iX22v0/edit#).
|
||||
|
||||
## Performance goals
|
||||
|
||||
For decoding to UTF-16, the goal is to perform at least as well as Gecko's old
|
||||
uconv. For decoding to UTF-8, the goal is to perform at least as well as
|
||||
rust-encoding.
|
||||
|
||||
Encoding to UTF-8 should be fast. (UTF-8 to UTF-8 encode should be equivalent
|
||||
to `memcpy` and UTF-16 to UTF-8 should be fast.)
|
||||
|
||||
Speed is a non-goal when encoding to legacy encodings. Encoding to legacy
|
||||
encodings should not be optimized for speed at the expense of code size as long
|
||||
as form submission and URL parsing in Gecko don't become noticeably too slow
|
||||
in real-world use.
|
||||
|
||||
A framework for measuring performance is [available separately][1].
|
||||
|
||||
[1]: https://github.com/hsivonen/encoding_bench/
|
||||
|
||||
## C binding
|
||||
|
||||
An FFI layer for encoding_rs is available as a
|
||||
[separate crate](https://github.com/hsivonen/encoding_c).
|
||||
|
||||
## Compatibility with rust-encoding
|
||||
|
||||
A compatibility layer that implements the rust-encoding API on top of
|
||||
encoding_rs is
|
||||
[provided as a separate crate](https://github.com/hsivonen/encoding_rs_compat)
|
||||
(cannot be uploaded to crates.io).
|
||||
|
||||
## Roadmap
|
||||
|
||||
- [x] Design the low-level API.
|
||||
- [x] Provide Rust-only convenience features (some BOM sniffing variants still
|
||||
TODO).
|
||||
- [x] Provide an stl/gsl-flavored C++ API.
|
||||
- [x] Implement all decoders and encoders.
|
||||
- [x] Add unit tests for all decoders and encoders.
|
||||
- [x] Finish BOM sniffing variants in Rust-only convenience features.
|
||||
- [x] Document the API.
|
||||
- [x] Publish the crate on crates.io.
|
||||
- [x] Create a solution for measuring performance.
|
||||
- [x] Accelerate ASCII conversions using SSE2 on x86.
|
||||
- [x] Accelerate ASCII conversions using ALU register-sized operations on
|
||||
non-x86 architectures (process a `usize` instead of `u8` at a time).
|
||||
- [x] Split FFI into a separate crate so that the FFI doesn't interfere with
|
||||
LTO in pure-Rust usage.
|
||||
- [x] Compress CJK indices by making use of sequential code points as well
|
||||
as Unicode-ordered parts of indices.
|
||||
- [x] Make lookups by label or name use binary search that searches from the
|
||||
end of the label/name to the start.
|
||||
- [x] Make labels with non-ASCII bytes fail fast.
|
||||
- [x] Parallelize UTF-8 validation using [Rayon](https://github.com/nikomatsakis/rayon).
|
||||
- [x] Provide an XPCOM/MFBT-flavored C++ API.
|
||||
- [ ] Investigate accelerating single-byte encode with a single fast-tracked
|
||||
range per encoding.
|
||||
- [ ] Replace uconv with encoding_rs in Gecko.
|
||||
- [x] Implement the rust-encoding API in terms of encoding_rs.
|
||||
- [ ] Investigate the use of NEON on newer ARM CPUs that have a lesser penalty
|
||||
on data flow from NEON to ALU registers.
|
||||
- [ ] Investigate Björn Höhrmann's lookup table acceleration for UTF-8 as
|
||||
adapted to Rust in rust-encoding.
|
||||
|
||||
## Release Notes
|
||||
|
||||
### 0.6.11
|
||||
|
||||
* Make `Encoder::has_pending_state()` public.
|
||||
* Update the `simd` crate dependency to 0.2.0.
|
||||
|
||||
### 0.6.10
|
||||
|
||||
* Reserve enough space for NCRs when encoding to ISO-2022-JP.
|
||||
* Correct max length calculations for multibyte decoders.
|
||||
* Correct max length calculations before BOM sniffing has been
|
||||
performed.
|
||||
* Correctly calculate max length when encoding from UTF-16 to GBK.
|
||||
|
||||
### 0.6.9
|
||||
|
||||
* [Don't prepend anything when gb18030 range decode
|
||||
fails](https://github.com/whatwg/encoding/issues/110). (Spec change.)
|
||||
|
||||
### 0.6.8
|
||||
|
||||
* Correctly handle the case where the first buffer contains potentially
|
||||
partial BOM and the next buffer is the last buffer.
|
||||
* Decode byte `7F` correctly in ISO-2022-JP.
|
||||
* Make UTF-16 to UTF-8 encode write closer to the end of the buffer.
|
||||
* Implement `Hash` for `Encoding`.
|
||||
|
||||
### 0.6.7
|
||||
|
||||
* [Map half-width katakana to full-width katakana in ISO-2022-JP
|
||||
encoder](https://github.com/whatwg/encoding/issues/105). (Spec change.)
|
||||
* Give `InputEmpty` correct precedence over `OutputFull` when encoding
|
||||
with replacement and the output buffer passed in is too short or the
|
||||
remaining space in the output buffer is too small after a replacement.
|
||||
|
||||
### 0.6.6
|
||||
|
||||
* Correct max length calculation when a partial BOM prefix is part of
|
||||
the decoder's state.
|
||||
|
||||
### 0.6.5
|
||||
|
||||
* Correct max length calculation in various encoders.
|
||||
* Correct max length calculation in the UTF-16 decoder.
|
||||
* Derive `PartialEq` and `Eq` for the `CoderResult`, `DecoderResult`
|
||||
and `EncoderResult` types.
|
||||
|
||||
### 0.6.4
|
||||
|
||||
* Avoid panic when encoding with replacement and the destination buffer is
|
||||
too short to hold one numeric character reference.
|
||||
|
||||
### 0.6.3
|
||||
|
||||
* Add support for 32-bit big-endian hosts. (For real this time.)
|
||||
|
||||
### 0.6.2
|
||||
|
||||
* Fix a panic from subslicing with bad indices in
|
||||
`Encoder::encode_from_utf16`. (Due to an oversight, it lacked the fix that
|
||||
`Encoder::encode_from_utf8` already had.)
|
||||
* Micro-optimize error status accumulation in non-streaming case.
|
||||
|
||||
### 0.6.1
|
||||
|
||||
* Avoid panic near integer overflow in a case that's unlikely to actually
|
||||
happen.
|
||||
* Address Clippy lints.
|
||||
|
||||
### 0.6.0
|
||||
|
||||
* Make the methods for computing worst-case buffer size requirements check
|
||||
for integer overflow.
|
||||
* Upgrade rayon to 0.7.0.
|
||||
|
||||
### 0.5.1
|
||||
|
||||
* Reorder methods for better documentation readability.
|
||||
* Add support for big-endian hosts. (Only 64-bit case actually tested.)
|
||||
* Optimize the ALU (non-SIMD) case for 32-bit ARM instead of x86_64.
|
||||
|
||||
### 0.5.0
|
||||
|
||||
* Avoid allocating excessively long buffers in non-streaming decode.
|
||||
* Fix the behavior of ISO-2022-JP and replacement decoders near the end of the
|
||||
output buffer.
|
||||
* Annotate the result structs with `#[must_use]`.
|
||||
|
||||
### 0.4.0
|
||||
|
||||
* Split FFI into a separate crate.
|
||||
* Performance tweaks.
|
||||
* CJK binary size and encoding performance changes.
|
||||
* Parallelize UTF-8 validation in the case of long buffers (with optional
|
||||
feature `parallel-utf8`).
|
||||
* Borrow even with ISO-2022-JP when possible.
|
||||
|
||||
### 0.3.2
|
||||
|
||||
* Fix moving pointers to alignment in ALU-based ASCII acceleration.
|
||||
* Fix errors in documentation and improve documentation.
|
||||
|
||||
### 0.3.1
|
||||
|
||||
* Fix UTF-8 to UTF-16 decode for byte sequences beginning with 0xEE.
|
||||
* Make UTF-8 to UTF-8 decode SSE2-accelerated when feature `simd-accel` is used.
|
||||
* When decoding and encoding ASCII-only input from or to an ASCII-compatible
|
||||
encoding using the non-streaming API, return a borrow of the input.
|
||||
* Make encode from UTF-16 to UTF-8 faster.
|
||||
|
||||
### 0.3
|
||||
|
||||
* Change the references to the instances of `Encoding` from `const` to `static`
|
||||
to make the referents unique across crates that use the references.
|
||||
* Introduce non-reference-typed `FOO_INIT` instances of `Encoding` to allow
|
||||
foreign crates to initialize `static` arrays with references to `Encoding`
|
||||
instances even under Rust's constraints that prohibit the initialization of
|
||||
`&'static Encoding`-typed array items with `&'static Encoding`-typed
|
||||
`statics`.
|
||||
* Document that the above two points will be reverted if Rust changes `const`
|
||||
to work so that cross-crate usage keeps the referents unique.
|
||||
* Return `Cow`s from Rust-only non-streaming methods for encode and decode.
|
||||
* `Encoding::for_bom()` returns the length of the BOM.
|
||||
* ASCII-accelerated conversions for encodings other than UTF-16LE, UTF-16BE,
|
||||
ISO-2022-JP and x-user-defined.
|
||||
* Add SSE2 acceleration behind the `simd-accel` feature flag. (Requires
|
||||
nightly Rust.)
|
||||
* Fix panic with long bogus labels.
|
||||
* Map [0xCA to U+05BA in windows-1255](https://github.com/whatwg/encoding/issues/73).
|
||||
(Spec change.)
|
||||
* Correct the [end of the Shift_JIS EUDC range](https://github.com/whatwg/encoding/issues/53).
|
||||
(Spec change.)
|
||||
|
||||
### 0.2.4
|
||||
|
||||
* Polish FFI documentation.
|
||||
|
||||
### 0.2.3
|
||||
|
||||
* Fix UTF-16 to UTF-8 encode.
|
||||
|
||||
### 0.2.2
|
||||
|
||||
* Add `Encoder.encode_from_utf8_to_vec_without_replacement()`.
|
||||
|
||||
### 0.2.1
|
||||
|
||||
* Add `Encoding.is_ascii_compatible()`.
|
||||
|
||||
* Add `Encoding::for_bom()`.
|
||||
|
||||
* Make `==` for `Encoding` use name comparison instead of pointer comparison,
|
||||
because uses of the encoding constants in different crates result in
|
||||
different addresses and the constants cannot be turned into statics without
|
||||
breaking other things.
|
||||
|
||||
### 0.2.0
|
||||
|
||||
The initial release.
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,2 @@
|
|||
fn_call_style = "Block"
|
||||
error_on_line_overflow = false
|
|
@ -0,0 +1,847 @@
|
|||
// Copyright 2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// It's assumed that in due course Rust will have explicit SIMD but will not
|
||||
// be good at run-time selection of SIMD vs. no-SIMD. In such a future,
|
||||
// x86_64 will always use SSE2 and 32-bit x86 will use SSE2 when compiled with
|
||||
// a Mozilla-shipped rustc. SIMD support and especially detection on ARM is a
|
||||
// mess. Under the circumstances, it seems to make sense to optimize the ALU
|
||||
// case for ARMv7 rather than x86. Annoyingly, I was unable to get useful
|
||||
// numbers of the actual ARMv7 CPU I have access to, because (thermal?)
|
||||
// throttling kept interfering. Since Raspberry Pi 3 (ARMv8 core but running
|
||||
// ARMv7 code) produced reproducible performance numbers, that's the ARM
|
||||
// computer that this code ended up being optimized for in the ALU case.
|
||||
// Less popular CPU architectures simply get the approach that was chosen based
|
||||
// on Raspberry Pi 3 measurements. The UTF-16 and UTF-8 ALU cases take
|
||||
// different approaches based on benchmarking on Raspberry Pi 3.
|
||||
|
||||
#[cfg(feature = "simd-accel")]
|
||||
use simd_funcs::*;
|
||||
|
||||
// Generates a plain element-by-element ASCII copy routine.
//
// The generated function reads `len` code units from `src`, writing each one
// (cast to `$dst_unit`) to `dst`, and stops at the first code unit above 127.
// It returns `Some((offending_unit, index))` in that case and `None` when all
// `len` units were ASCII. The caller must guarantee that both pointers are
// valid for `len` units.
macro_rules! ascii_naive {
    ($name:ident,
     $src_unit:ty,
     $dst_unit:ty) => (
    #[inline(always)]
    pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) -> Option<($src_unit, usize)> {
        // Raw pointer access is deliberate: skipping the bound check
        // matters a lot for performance here.
        let mut index = 0usize;
        while index < len {
            let unit = *src.offset(index as isize);
            if unit > 127 {
                return Some((unit, index));
            }
            *dst.offset(index as isize) = unit as $dst_unit;
            index += 1;
        }
        None
    });
}
|
||||
|
||||
// Generates an ASCII copy routine for equally sized source and destination
// units that processes whole ALU words once both pointers are aligned.
//
// The generated function returns `Some((non_ascii_unit, index))` for the first
// code unit above 127, or `None` when all `len` units were ASCII. The word
// path is only taken when `src` and `dst` share the same misalignment, so that
// advancing both by the same number of units aligns both.
macro_rules! ascii_alu {
    ($name:ident,
     $src_unit:ty,
     $dst_unit:ty,
     $stride_fn:ident) => (
    #[inline(always)]
    pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) -> Option<($src_unit, usize)> {
        let mut offset = 0usize;
        let src_misalignment = (src as usize) & ALIGNMENT_MASK;
        let dst_misalignment = (dst as usize) & ALIGNMENT_MASK;
        if src_misalignment == dst_misalignment {
            let mut head = (ALIGNMENT - src_misalignment) & ALIGNMENT_MASK;
            if head + STRIDE_SIZE <= len {
                // Moving the pointers to alignment seems to be a pessimization
                // on x86_64 when UTF-16 is the internal representation, but a
                // win on ARM (ARMv7 code on an ARMv8 Raspberry Pi 3), with
                // mixed results when encoding from UTF-16. Since x86 and
                // x86_64 are expected to use SSE2 in due course, the move to
                // alignment stays. Testing on more ARM CPUs and on real MIPS
                // and POWER hardware would be good.
                while head != 0 {
                    let unit = *src.offset(offset as isize);
                    if unit > 127 {
                        return Some((unit, offset));
                    }
                    *dst.offset(offset as isize) = unit as $dst_unit;
                    offset += 1;
                    head -= 1;
                }
                loop {
                    match $stride_fn(src.offset(offset as isize) as *const usize,
                                     dst.offset(offset as isize) as *mut usize) {
                        Some(num_ascii) => {
                            offset += num_ascii;
                            return Some((*src.offset(offset as isize), offset));
                        }
                        None => {
                            offset += STRIDE_SIZE;
                            if offset + STRIDE_SIZE > len {
                                break;
                            }
                        }
                    }
                }
            }
        }
        // Unit-at-a-time tail; also the whole job when the word path was skipped.
        while offset < len {
            let unit = *src.offset(offset as isize);
            if unit > 127 {
                return Some((unit, offset));
            }
            *dst.offset(offset as isize) = unit as $dst_unit;
            offset += 1;
        }
        None
    });
}
|
||||
|
||||
// Generates an ASCII/Basic Latin conversion routine between `u8` and `u16`
// units that processes whole ALU words once both pointers are aligned.
//
// The generated function returns `Some((non_ascii_unit, index))` for the first
// code unit above 127, or `None` when all `len` units were converted.
//
// The narrower side is moved to alignment; the word path is only taken if the
// wider side is then aligned as well. That probe is done with integer address
// arithmetic rather than `pointer::offset`, because the probe happens before
// `len` is examined and `offset` must never compute a pointer outside the
// buffer (it is undefined behavior even if the result is not dereferenced).
macro_rules! basic_latin_alu {
    ($name:ident,
     $src_unit:ty,
     $dst_unit:ty,
     $stride_fn:ident) => (
    #[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
    #[inline(always)]
    pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) -> Option<($src_unit, usize)> {
        let mut offset = 0usize;
        // This loop is only broken out of as a `goto` forward
        loop {
            let mut until_alignment = {
                // Check if the other unit aligns if we move the narrower unit
                // to alignment.
                if ::std::mem::size_of::<$src_unit>() < ::std::mem::size_of::<$dst_unit>() {
                    // ascii_to_basic_latin
                    let src_until_alignment = (ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
                    // Address `dst` would have after advancing by that many units.
                    let dst_addr = (dst as usize)
                        .wrapping_add(src_until_alignment * ::std::mem::size_of::<$dst_unit>());
                    if dst_addr & ALIGNMENT_MASK != 0 {
                        break;
                    }
                    src_until_alignment
                } else {
                    // basic_latin_to_ascii
                    let dst_until_alignment = (ALIGNMENT - ((dst as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
                    // Address `src` would have after advancing by that many units.
                    let src_addr = (src as usize)
                        .wrapping_add(dst_until_alignment * ::std::mem::size_of::<$src_unit>());
                    if src_addr & ALIGNMENT_MASK != 0 {
                        break;
                    }
                    dst_until_alignment
                }
            };
            if until_alignment + STRIDE_SIZE <= len {
                // Moving pointers to alignment seems to be a pessimization on
                // x86_64 for operations that have UTF-16 as the internal
                // Unicode representation. However, since it seems to be a win
                // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
                // mixed results when encoding from UTF-16 and since x86 and
                // x86_64 should be using SSE2 in due course, keeping the move
                // to alignment here. It would be good to test on more ARM CPUs
                // and on real MIPS and POWER hardware.
                while until_alignment != 0 {
                    let code_unit = *(src.offset(offset as isize));
                    if code_unit > 127 {
                        return Some((code_unit, offset));
                    }
                    *(dst.offset(offset as isize)) = code_unit as $dst_unit;
                    offset += 1;
                    until_alignment -= 1;
                }
                loop {
                    // A failed stride falls through to the unit-at-a-time loop
                    // below, which locates the offending code unit.
                    if !$stride_fn(src.offset(offset as isize) as *const usize,
                                   dst.offset(offset as isize) as *mut usize) {
                        break;
                    }
                    offset += STRIDE_SIZE;
                    if offset + STRIDE_SIZE > len {
                        break;
                    }
                }
            }
            break;
        }
        while offset < len {
            let code_unit = *(src.offset(offset as isize));
            if code_unit > 127 {
                return Some((code_unit, offset));
            }
            *(dst.offset(offset as isize)) = code_unit as $dst_unit;
            offset += 1;
        }
        None
    });
}
|
||||
|
||||
// Generates an ASCII copy/conversion routine that works in SIMD strides.
//
// One of four stride functions is chosen once, up front, according to whether
// `src` and `dst` are aligned. Strides are consumed while they succeed and
// while a whole stride still fits in `len`; whatever remains, including a
// stride that failed, is handled one code unit at a time. Returns
// `Some((non_ascii_unit, index))` or `None` if all `len` units were ASCII.
macro_rules! ascii_simd {
    ($name:ident,
     $src_unit:ty,
     $dst_unit:ty,
     $stride_both_aligned:ident,
     $stride_src_aligned:ident,
     $stride_dst_aligned:ident,
     $stride_neither_aligned:ident) => (
    #[inline(always)]
    pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) -> Option<($src_unit, usize)> {
        let mut offset = 0usize;
        // XXX should we have more branchy code to move the pointers to
        // alignment if they aren't aligned but could align after
        // processing a few code units?
        if STRIDE_SIZE <= len {
            // XXX Should we first process one stride unconditionally as
            // unaligned to avoid the cost of the branchiness below if the
            // first stride fails anyway?
            // XXX Should we just use unaligned SSE2 access unconditionally?
            // It seems that on Haswell, it would make sense to just use
            // unaligned and not bother checking. Need to benchmark older
            // architectures before deciding.
            let src_is_aligned = ((src as usize) & ALIGNMENT_MASK) == 0;
            let dst_is_aligned = ((dst as usize) & ALIGNMENT_MASK) == 0;
            match (src_is_aligned, dst_is_aligned) {
                (true, true) => {
                    while $stride_both_aligned(src.offset(offset as isize),
                                               dst.offset(offset as isize)) {
                        offset += STRIDE_SIZE;
                        if offset + STRIDE_SIZE > len {
                            break;
                        }
                    }
                }
                (true, false) => {
                    while $stride_src_aligned(src.offset(offset as isize),
                                              dst.offset(offset as isize)) {
                        offset += STRIDE_SIZE;
                        if offset + STRIDE_SIZE > len {
                            break;
                        }
                    }
                }
                (false, true) => {
                    while $stride_dst_aligned(src.offset(offset as isize),
                                              dst.offset(offset as isize)) {
                        offset += STRIDE_SIZE;
                        if offset + STRIDE_SIZE > len {
                            break;
                        }
                    }
                }
                (false, false) => {
                    while $stride_neither_aligned(src.offset(offset as isize),
                                                  dst.offset(offset as isize)) {
                        offset += STRIDE_SIZE;
                        if offset + STRIDE_SIZE > len {
                            break;
                        }
                    }
                }
            }
        }
        while offset < len {
            let unit = *src.offset(offset as isize);
            if unit > 127 {
                return Some((unit, offset));
            }
            *dst.offset(offset as isize) = unit as $dst_unit;
            offset += 1;
        }
        None
    });
}
|
||||
|
||||
// Generates a stride function that loads one vector from `src` with `$load`
// and, only if `is_ascii` accepts it, writes it to `dst` with `$store`.
// The generated function returns whether the stride was copied.
macro_rules! ascii_to_ascii_simd_stride {
    ($name:ident,
     $load:ident,
     $store:ident) => (
    #[inline(always)]
    pub unsafe fn $name(src: *const u8, dst: *mut u8) -> bool {
        let chunk = $load(src);
        let all_ascii = is_ascii(chunk);
        if all_ascii {
            $store(dst, chunk);
        }
        all_ascii
    });
}
|
||||
|
||||
// Generates a stride function that loads one vector of bytes with `$load`
// and, only if `is_ascii` accepts it, splits it with `unpack` and writes the
// two halves to `dst` and `dst + 8` with `$store`.
// The generated function returns whether the stride was converted.
macro_rules! ascii_to_basic_latin_simd_stride {
    ($name:ident,
     $load:ident,
     $store:ident) => (
    #[inline(always)]
    pub unsafe fn $name(src: *const u8, dst: *mut u16) -> bool {
        let chunk = $load(src);
        let all_ascii = is_ascii(chunk);
        if all_ascii {
            let (low_half, high_half) = unpack(chunk);
            $store(dst, low_half);
            $store(dst.offset(8), high_half);
        }
        all_ascii
    });
}
|
||||
|
||||
// Generates a stride function that loads two vectors of 16-bit units (from
// `src` and `src + 8`) with `$load`, asks `pack_basic_latin` to combine them,
// and writes the packed result to `dst` with `$store` when packing succeeds.
// The generated function returns whether the stride was converted.
macro_rules! basic_latin_to_ascii_simd_stride {
    ($name:ident,
     $load:ident,
     $store:ident) => (
    #[inline(always)]
    pub unsafe fn $name(src: *const u16, dst: *mut u8) -> bool {
        let low_half = $load(src);
        let high_half = $load(src.offset(8));
        if let Some(packed) = pack_basic_latin(low_half, high_half) {
            $store(dst, packed);
            true
        } else {
            false
        }
    });
}
|
||||
|
||||
cfg_if! {
|
||||
if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
|
||||
// SIMD
|
||||
|
||||
pub const STRIDE_SIZE: usize = 16;
|
||||
|
||||
const ALIGNMENT_MASK: usize = 15;
|
||||
|
||||
ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
|
||||
ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
|
||||
ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_dst_aligned, load16_unaligned, store16_aligned);
|
||||
ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
|
||||
|
||||
ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
|
||||
ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
|
||||
ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_dst_aligned, load16_unaligned, store8_aligned);
|
||||
ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
|
||||
|
||||
basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
|
||||
basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
|
||||
basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
|
||||
basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
|
||||
|
||||
ascii_simd!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_dst_aligned, ascii_to_ascii_stride_neither_aligned);
|
||||
ascii_simd!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_dst_aligned, ascii_to_basic_latin_stride_neither_aligned);
|
||||
ascii_simd!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
|
||||
} else if #[cfg(all(target_endian = "little", target_pointer_width = "64"))] {
|
||||
// Aligned ALU word, little-endian, 64-bit
|
||||
|
||||
pub const STRIDE_SIZE: usize = 16;
|
||||
|
||||
const ALIGNMENT: usize = 8;
|
||||
|
||||
const ALIGNMENT_MASK: usize = 7;
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn ascii_to_basic_latin_stride_little_64(src: *const usize, dst: *mut usize) -> bool {
|
||||
let word = *src;
|
||||
let second_word = *(src.offset(1));
|
||||
// Check if the words contains non-ASCII
|
||||
if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 {
|
||||
return false;
|
||||
}
|
||||
let first = ((0x00000000_FF000000usize & word) << 24) |
|
||||
((0x00000000_00FF0000usize & word) << 16) |
|
||||
((0x00000000_0000FF00usize & word) << 8) |
|
||||
(0x00000000_000000FFusize & word);
|
||||
let second = ((0xFF000000_00000000usize & word) >> 8) |
|
||||
((0x00FF0000_00000000usize & word) >> 16) |
|
||||
((0x0000FF00_00000000usize & word) >> 24) |
|
||||
((0x000000FF_00000000usize & word) >> 32);
|
||||
let third = ((0x00000000_FF000000usize & second_word) << 24) |
|
||||
((0x00000000_00FF0000usize & second_word) << 16) |
|
||||
((0x00000000_0000FF00usize & second_word) << 8) |
|
||||
(0x00000000_000000FFusize & second_word);
|
||||
let fourth = ((0xFF000000_00000000usize & second_word) >> 8) |
|
||||
((0x00FF0000_00000000usize & second_word) >> 16) |
|
||||
((0x0000FF00_00000000usize & second_word) >> 24) |
|
||||
((0x000000FF_00000000usize & second_word) >> 32);
|
||||
*dst = first;
|
||||
*(dst.offset(1)) = second;
|
||||
*(dst.offset(2)) = third;
|
||||
*(dst.offset(3)) = fourth;
|
||||
true
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn basic_latin_to_ascii_stride_little_64(src: *const usize, dst: *mut usize) -> bool {
|
||||
let first = *src;
|
||||
let second = *(src.offset(1));
|
||||
let third = *(src.offset(2));
|
||||
let fourth = *(src.offset(3));
|
||||
if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 {
|
||||
return false;
|
||||
}
|
||||
let word = ((0x00FF0000_00000000usize & second) << 8) |
|
||||
((0x000000FF_00000000usize & second) << 16) |
|
||||
((0x00000000_00FF0000usize & second) << 24) |
|
||||
((0x00000000_000000FFusize & second) << 32) |
|
||||
((0x00FF0000_00000000usize & first) >> 24) |
|
||||
((0x000000FF_00000000usize & first) >> 16) |
|
||||
((0x00000000_00FF0000usize & first) >> 8) |
|
||||
(0x00000000_000000FFusize & first);
|
||||
let second_word = ((0x00FF0000_00000000usize & fourth) << 8) |
|
||||
((0x000000FF_00000000usize & fourth) << 16) |
|
||||
((0x00000000_00FF0000usize & fourth) << 24) |
|
||||
((0x00000000_000000FFusize & fourth) << 32) |
|
||||
((0x00FF0000_00000000usize & third) >> 24) |
|
||||
((0x000000FF_00000000usize & third) >> 16) |
|
||||
((0x00000000_00FF0000usize & third) >> 8) |
|
||||
(0x00000000_000000FFusize & third);
|
||||
*dst = word;
|
||||
*(dst.offset(1)) = second_word;
|
||||
true
|
||||
}
|
||||
|
||||
basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_little_64);
|
||||
basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_little_64);
|
||||
} else if #[cfg(all(target_endian = "little", target_pointer_width = "32"))] {
|
||||
// Aligned ALU word, little-endian, 32-bit
|
||||
|
||||
pub const STRIDE_SIZE: usize = 8;
|
||||
|
||||
const ALIGNMENT: usize = 4;
|
||||
|
||||
const ALIGNMENT_MASK: usize = 3;
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn ascii_to_basic_latin_stride_little_32(src: *const usize, dst: *mut usize) -> bool {
|
||||
let word = *src;
|
||||
let second_word = *(src.offset(1));
|
||||
// Check if the words contains non-ASCII
|
||||
if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 {
|
||||
return false;
|
||||
}
|
||||
let first = ((0x0000FF00usize & word) << 8) |
|
||||
(0x000000FFusize & word);
|
||||
let second = ((0xFF000000usize & word) >> 8) |
|
||||
((0x00FF0000usize & word) >> 16);
|
||||
let third = ((0x0000FF00usize & second_word) << 8) |
|
||||
(0x000000FFusize & second_word);
|
||||
let fourth = ((0xFF000000usize & second_word) >> 8) |
|
||||
((0x00FF0000usize & second_word) >> 16);
|
||||
*dst = first;
|
||||
*(dst.offset(1)) = second;
|
||||
*(dst.offset(2)) = third;
|
||||
*(dst.offset(3)) = fourth;
|
||||
return true;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn basic_latin_to_ascii_stride_little_32(src: *const usize, dst: *mut usize) -> bool {
|
||||
let first = *src;
|
||||
let second = *(src.offset(1));
|
||||
let third = *(src.offset(2));
|
||||
let fourth = *(src.offset(3));
|
||||
if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 {
|
||||
return false;
|
||||
}
|
||||
let word = ((0x00FF0000usize & second) << 8) |
|
||||
((0x000000FFusize & second) << 16) |
|
||||
((0x00FF0000usize & first) >> 8) |
|
||||
(0x000000FFusize & first);
|
||||
let second_word = ((0x00FF0000usize & fourth) << 8) |
|
||||
((0x000000FFusize & fourth) << 16) |
|
||||
((0x00FF0000usize & third) >> 8) |
|
||||
(0x000000FFusize & third);
|
||||
*dst = word;
|
||||
*(dst.offset(1)) = second_word;
|
||||
return true;
|
||||
}
|
||||
|
||||
basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_little_32);
|
||||
basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_little_32);
|
||||
} else if #[cfg(all(target_endian = "big", target_pointer_width = "64"))] {
|
||||
// Aligned ALU word, big-endian, 64-bit
|
||||
|
||||
pub const STRIDE_SIZE: usize = 16;
|
||||
|
||||
const ALIGNMENT: usize = 8;
|
||||
|
||||
const ALIGNMENT_MASK: usize = 7;
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn ascii_to_basic_latin_stride_big_64(src: *const usize, dst: *mut usize) -> bool {
|
||||
let word = *src;
|
||||
let second_word = *(src.offset(1));
|
||||
// Check if the words contains non-ASCII
|
||||
if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 {
|
||||
return false;
|
||||
}
|
||||
let first = ((0xFF000000_00000000usize & word) >> 8) |
|
||||
((0x00FF0000_00000000usize & word) >> 16) |
|
||||
((0x0000FF00_00000000usize & word) >> 24) |
|
||||
((0x000000FF_00000000usize & word) >> 32);
|
||||
let second = ((0x00000000_FF000000usize & word) << 24) |
|
||||
((0x00000000_00FF0000usize & word) << 16) |
|
||||
((0x00000000_0000FF00usize & word) << 8) |
|
||||
(0x00000000_000000FFusize & word);
|
||||
let third = ((0xFF000000_00000000usize & second_word) >> 8) |
|
||||
((0x00FF0000_00000000usize & second_word) >> 16) |
|
||||
((0x0000FF00_00000000usize & second_word) >> 24) |
|
||||
((0x000000FF_00000000usize & second_word) >> 32);
|
||||
let fourth = ((0x00000000_FF000000usize & second_word) << 24) |
|
||||
((0x00000000_00FF0000usize & second_word) << 16) |
|
||||
((0x00000000_0000FF00usize & second_word) << 8) |
|
||||
(0x00000000_000000FFusize & second_word);
|
||||
*dst = first;
|
||||
*(dst.offset(1)) = second;
|
||||
*(dst.offset(2)) = third;
|
||||
*(dst.offset(3)) = fourth;
|
||||
return true;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn basic_latin_to_ascii_stride_big_64(src: *const usize, dst: *mut usize) -> bool {
|
||||
let first = *src;
|
||||
let second = *(src.offset(1));
|
||||
let third = *(src.offset(2));
|
||||
let fourth = *(src.offset(3));
|
||||
if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 {
|
||||
return false;
|
||||
}
|
||||
let word = ((0x00FF0000_00000000usize & first) << 8) |
|
||||
((0x000000FF_00000000usize & first) << 16) |
|
||||
((0x00000000_00FF0000usize & first) << 24) |
|
||||
((0x00000000_000000FFusize & first) << 32) |
|
||||
((0x00FF0000_00000000usize & second) >> 24) |
|
||||
((0x000000FF_00000000usize & second) >> 16) |
|
||||
((0x00000000_00FF0000usize & second) >> 8) |
|
||||
(0x00000000_000000FFusize & second);
|
||||
let second_word = ((0x00FF0000_00000000usize & third) << 8) |
|
||||
((0x000000FF_00000000usize & third) << 16) |
|
||||
((0x00000000_00FF0000usize & third) << 24) |
|
||||
((0x00000000_000000FFusize & third) << 32) |
|
||||
((0x00FF0000_00000000usize & fourth) >> 24) |
|
||||
((0x000000FF_00000000usize & fourth) >> 16) |
|
||||
((0x00000000_00FF0000usize & fourth) >> 8) |
|
||||
(0x00000000_000000FFusize & fourth);
|
||||
*dst = word;
|
||||
*(dst.offset(1)) = second_word;
|
||||
return true;
|
||||
}
|
||||
|
||||
basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_big_64);
|
||||
basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_big_64);
|
||||
} else if #[cfg(all(target_endian = "big", target_pointer_width = "32"))] {
|
||||
// Aligned ALU word, big-endian, 32-bit
|
||||
|
||||
pub const STRIDE_SIZE: usize = 8;
|
||||
|
||||
const ALIGNMENT: usize = 4;
|
||||
|
||||
const ALIGNMENT_MASK: usize = 3;
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn ascii_to_basic_latin_stride_big_32(src: *const usize, dst: *mut usize) -> bool {
|
||||
let word = *src;
|
||||
let second_word = *(src.offset(1));
|
||||
// Check if the words contains non-ASCII
|
||||
if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 {
|
||||
return false;
|
||||
}
|
||||
let first = ((0xFF000000usize & word) >> 8) |
|
||||
((0x00FF0000usize & word) >> 16);
|
||||
let second = ((0x0000FF00usize & word) << 8) |
|
||||
(0x000000FFusize & word);
|
||||
let third = ((0xFF000000usize & second_word) >> 8) |
|
||||
((0x00FF0000usize & second_word) >> 16);
|
||||
let fourth = ((0x0000FF00usize & second_word) << 8) |
|
||||
(0x000000FFusize & second_word);
|
||||
*dst = first;
|
||||
*(dst.offset(1)) = second;
|
||||
*(dst.offset(2)) = third;
|
||||
*(dst.offset(3)) = fourth;
|
||||
return true;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn basic_latin_to_ascii_stride_big_32(src: *const usize, dst: *mut usize) -> bool {
|
||||
let first = *src;
|
||||
let second = *(src.offset(1));
|
||||
let third = *(src.offset(2));
|
||||
let fourth = *(src.offset(3));
|
||||
if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 {
|
||||
return false;
|
||||
}
|
||||
let word = ((0x00FF0000usize & first) << 8) |
|
||||
((0x000000FFusize & first) << 16) |
|
||||
((0x00FF0000usize & second) >> 8) |
|
||||
(0x000000FFusize & second);
|
||||
let second_word = ((0x00FF0000usize & third) << 8) |
|
||||
((0x000000FFusize & third) << 16) |
|
||||
((0x00FF0000usize & fourth) >> 8) |
|
||||
(0x000000FFusize & fourth);
|
||||
*dst = word;
|
||||
*(dst.offset(1)) = second_word;
|
||||
return true;
|
||||
}
|
||||
|
||||
basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_big_32);
|
||||
basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_big_32);
|
||||
} else {
|
||||
ascii_naive!(ascii_to_ascii, u8, u8);
|
||||
ascii_naive!(ascii_to_basic_latin, u8, u16);
|
||||
ascii_naive!(basic_latin_to_ascii, u16, u8);
|
||||
}
|
||||
}
|
||||
|
||||
cfg_if! {
|
||||
if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
|
||||
} else if #[cfg(target_endian = "little")] {
|
||||
// Little-endian variant: counts the zero bits below the lowest set bit of
// `word`.
#[inline(always)]
fn count_zeros(word: usize) -> u32 {
    usize::trailing_zeros(word)
}
|
||||
} else {
|
||||
// Big-endian variant: counts the zero bits above the highest set bit of
// `word`.
#[inline(always)]
fn count_zeros(word: usize) -> u32 {
    usize::leading_zeros(word)
}
|
||||
}
|
||||
}
|
||||
|
||||
cfg_if! {
|
||||
if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
|
||||
#[inline(always)]
|
||||
pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
|
||||
let src = slice.as_ptr();
|
||||
let len = slice.len();
|
||||
let mut offset = 0usize;
|
||||
if STRIDE_SIZE <= len {
|
||||
// XXX Should we first process one stride unconditionally as unaligned to
|
||||
// avoid the cost of the branchiness below if the first stride fails anyway?
|
||||
// XXX Should we just use unaligned SSE2 access unconditionally? It seems that
|
||||
// on Haswell, it would make sense to just use unaligned and not bother
|
||||
// checking. Need to benchmark older architectures before deciding.
|
||||
if ((src as usize) & ALIGNMENT_MASK) == 0 {
|
||||
loop {
|
||||
let simd = unsafe { load16_aligned(src.offset(offset as isize)) };
|
||||
if let Some(consumed) = check_ascii(simd) {
|
||||
offset += consumed;
|
||||
let non_ascii = unsafe { *src.offset(offset as isize) };
|
||||
return Some((non_ascii, offset));
|
||||
}
|
||||
offset += STRIDE_SIZE;
|
||||
if offset + STRIDE_SIZE > len {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
loop {
|
||||
let simd = unsafe { load16_unaligned(src.offset(offset as isize)) };
|
||||
if let Some(consumed) = check_ascii(simd) {
|
||||
offset += consumed;
|
||||
let non_ascii = unsafe { *src.offset(offset as isize) };
|
||||
return Some((non_ascii, offset));
|
||||
}
|
||||
offset += STRIDE_SIZE;
|
||||
if offset + STRIDE_SIZE > len {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
while offset < len {
|
||||
let code_unit = slice[offset];
|
||||
if code_unit > 127 {
|
||||
return Some((code_unit, offset));
|
||||
}
|
||||
offset += 1;
|
||||
}
|
||||
None
|
||||
}
|
||||
} else {
|
||||
// `as` truncates, so works on 32-bit, too.
|
||||
const ASCII_MASK: usize = 0x80808080_80808080u64 as usize;
|
||||
const BASIC_LATIN_MASK: usize = 0xFF80FF80_FF80FF80u64 as usize;
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option<usize> {
|
||||
let word = *src;
|
||||
let second_word = *(src.offset(1));
|
||||
*dst = word;
|
||||
*(dst.offset(1)) = second_word;
|
||||
find_non_ascii(word, second_word)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn validate_ascii_stride(src: *const usize) -> Option<usize> {
|
||||
let word = *src;
|
||||
let second_word = *(src.offset(1));
|
||||
find_non_ascii(word, second_word)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn find_non_ascii(word: usize, second_word: usize) -> Option<usize> {
|
||||
let word_masked = word & ASCII_MASK;
|
||||
let second_masked = second_word & ASCII_MASK;
|
||||
if (word_masked | second_masked) == 0 {
|
||||
return None;
|
||||
}
|
||||
if word_masked != 0 {
|
||||
let zeros = count_zeros(word_masked);
|
||||
// `zeros` now contains 7 (for the seven bits of non-ASCII)
|
||||
// plus 8 times the number of ASCII in text order before the
|
||||
// non-ASCII byte in the little-endian case or 8 times the number of ASCII in
|
||||
// text order before the non-ASCII byte in the big-endian case.
|
||||
let num_ascii = (zeros >> 3) as usize;
|
||||
return Some(num_ascii);
|
||||
}
|
||||
let zeros = count_zeros(second_masked);
|
||||
// `zeros` now contains 7 (for the seven bits of non-ASCII)
|
||||
// plus 8 times the number of ASCII in text order before the
|
||||
// non-ASCII byte in the little-endian case or 8 times the number of ASCII in
|
||||
// text order before the non-ASCII byte in the big-endian case.
|
||||
let num_ascii = (zeros >> 3) as usize;
|
||||
Some(ALIGNMENT + num_ascii)
|
||||
}
|
||||
|
||||
ascii_alu!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride);
|
||||
|
||||
#[inline(always)]
|
||||
pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
|
||||
let src = slice.as_ptr();
|
||||
let len = slice.len();
|
||||
let mut offset = 0usize;
|
||||
let mut until_alignment = (ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
|
||||
if until_alignment + STRIDE_SIZE <= len {
|
||||
while until_alignment != 0 {
|
||||
let code_unit = slice[offset];
|
||||
if code_unit > 127 {
|
||||
return Some((code_unit, offset));
|
||||
}
|
||||
offset += 1;
|
||||
until_alignment -= 1;
|
||||
}
|
||||
loop {
|
||||
let ptr = unsafe { src.offset(offset as isize) as *const usize };
|
||||
if let Some(num_ascii) = unsafe { validate_ascii_stride(ptr) } {
|
||||
offset += num_ascii;
|
||||
return Some((unsafe { *(src.offset(offset as isize)) }, offset));
|
||||
}
|
||||
offset += STRIDE_SIZE;
|
||||
if offset + STRIDE_SIZE > len {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
while offset < len {
|
||||
let code_unit = slice[offset];
|
||||
if code_unit > 127 {
|
||||
return Some((code_unit, offset));
|
||||
}
|
||||
offset += 1;
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
|
||||
match validate_ascii(bytes) {
|
||||
None => bytes.len(),
|
||||
Some((_, num_valid)) => num_valid,
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the index of the first byte in `bytes` that is either non-ASCII
// (0x80 or above) or one of 0x1B, 0x0E and 0x0F, or `bytes.len()` if there is
// no such byte.
pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
    bytes
        .iter()
        .position(|&b| match b {
            0x1B | 0x0E | 0x0F => true,
            _ => b >= 0x80,
        })
        .unwrap_or(bytes.len())
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Generates a test that feeds the tested function 32 code units with a
    // single non-ASCII unit (0xAA) placed at every position in turn, and
    // checks the reported unit, the reported index and the units written
    // before that index.
    macro_rules! test_ascii {
        ($test_name:ident,
         $fn_tested:ident,
         $src_unit:ty,
         $dst_unit:ty) => (
        #[test]
        fn $test_name() {
            for non_ascii_at in 0..32 {
                let src: Vec<$src_unit> = (0..32)
                    .map(|j| if j == non_ascii_at { 0xAA } else { j + 0x40 })
                    .map(|c| c as $src_unit)
                    .collect();
                let mut dst: Vec<$dst_unit> = vec![0; 32];
                let outcome = unsafe { $fn_tested(src.as_ptr(), dst.as_mut_ptr(), 32) };
                if let Some((non_ascii, num_ascii)) = outcome {
                    assert_eq!(non_ascii, 0xAA);
                    assert_eq!(num_ascii, non_ascii_at);
                    for (j, unit) in dst.iter().take(non_ascii_at).enumerate() {
                        assert_eq!(*unit, (j + 0x40) as $dst_unit);
                    }
                } else {
                    unreachable!("Should always find non-ASCII");
                }
            }
        });
    }

    test_ascii!(test_ascii_to_ascii, ascii_to_ascii, u8, u8);
    test_ascii!(test_ascii_to_basic_latin, ascii_to_basic_latin, u8, u16);
    test_ascii!(test_basic_latin_to_ascii, basic_latin_to_ascii, u16, u8);
}
|
|
@ -0,0 +1,393 @@
|
|||
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use handles::*;
|
||||
use data::*;
|
||||
use variant::*;
|
||||
use super::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range32;
|
||||
|
||||
/// State held by the Big5 decoder.
pub struct Big5Decoder {
|
||||
// Pending lead byte, if any. `plus_one_if_lead` counts it as one extra
// input byte when this is `Some`. Presumably it is a lead byte seen at the
// end of earlier input -- confirm against the decoder macro.
lead: Option<u8>,
|
||||
}
|
||||
|
||||
impl Big5Decoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::Big5(Big5Decoder { lead: None })
|
||||
}
|
||||
|
||||
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(
|
||||
match self.lead {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
// If there is a lead but the next byte isn't a valid trail, an
|
||||
// error is generated for the lead (+1). Then another iteration checks
|
||||
// space, which needs +1 to account for the possibility of astral
|
||||
// output or combining pair.
|
||||
checked_add(1, self.plus_one_if_lead(byte_length))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// No need to account for REPLACEMENT CHARACTERS.
|
||||
// Cases:
|
||||
// ASCII: 1 to 1
|
||||
// Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4
|
||||
// lead set and first byte is trail: 1 to 4 worst case
|
||||
//
|
||||
// When checking for space for the last byte:
|
||||
// no lead: the last byte must be ASCII (or fatal error): 1 to 1
|
||||
// lead set: space for 4 bytes was already checked when reading the
|
||||
// lead, hence the last lead and the last trail together are worst
|
||||
// case 2 to 4.
|
||||
//
|
||||
// If lead set and the input is a single trail byte, the worst-case
|
||||
// output is 4, so we need to add one before multiplying if lead is
|
||||
// set.
|
||||
//
|
||||
// Finally, add two so that if input is non-zero, the output is at
|
||||
// least 4.
|
||||
checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length)))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
// If there is a lead but the next byte isn't a valid trail, an
|
||||
// error is generated for the lead (+(1*3)). Then another iteration
|
||||
// checks space, which needs +3 to account for the possibility of astral
|
||||
// output or combining pair. In between start and end, the worst case
|
||||
// is that every byte is bad: *3.
|
||||
checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length)))
|
||||
}
|
||||
|
||||
ascii_compatible_two_byte_decoder_functions!(
|
||||
{
|
||||
// If lead is between 0x81 and 0xFE, inclusive,
|
||||
// subtract offset 0x81.
|
||||
let non_ascii_minus_offset =
|
||||
non_ascii.wrapping_sub(0x81);
|
||||
if non_ascii_minus_offset > (0xFE - 0x81) {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
non_ascii_minus_offset
|
||||
},
|
||||
{
|
||||
// If trail is between 0x40 and 0x7E, inclusive,
|
||||
// subtract offset 0x40. Else if trail is
|
||||
// between 0xA1 and 0xFE, inclusive, subtract
|
||||
// offset 0x62.
|
||||
// TODO: Find out which range is more probable.
|
||||
let mut trail_minus_offset =
|
||||
byte.wrapping_sub(0x40);
|
||||
if trail_minus_offset > (0x7E - 0x40) {
|
||||
let trail_minus_range_start =
|
||||
byte.wrapping_sub(0xA1);
|
||||
if trail_minus_range_start >
|
||||
(0xFE - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
trail_minus_offset = byte - 0x62;
|
||||
}
|
||||
let pointer = lead_minus_offset as usize *
|
||||
157usize +
|
||||
trail_minus_offset as usize;
|
||||
let rebased_pointer = pointer.wrapping_sub(942);
|
||||
let low_bits = big5_low_bits(rebased_pointer);
|
||||
if low_bits == 0 {
|
||||
match pointer {
|
||||
1133 => {
|
||||
handle.write_big5_combination(0x00CAu16,
|
||||
0x0304u16)
|
||||
}
|
||||
1135 => {
|
||||
handle.write_big5_combination(0x00CAu16,
|
||||
0x030Cu16)
|
||||
}
|
||||
1164 => {
|
||||
handle.write_big5_combination(0x00EAu16,
|
||||
0x0304u16)
|
||||
}
|
||||
1166 => {
|
||||
handle.write_big5_combination(0x00EAu16,
|
||||
0x030Cu16)
|
||||
}
|
||||
_ => {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
} else if big5_is_astral(rebased_pointer) {
|
||||
handle.write_astral(low_bits as u32 |
|
||||
0x20000u32)
|
||||
} else {
|
||||
handle.write_bmp_excl_ascii(low_bits)
|
||||
}
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
byte,
|
||||
lead_minus_offset,
|
||||
unread_handle_trail,
|
||||
source,
|
||||
handle,
|
||||
'outermost,
|
||||
copy_ascii_from_check_space_astral,
|
||||
check_space_astral,
|
||||
false);
|
||||
}
|
||||
|
||||
/// Big5 encoder. A unit struct: it has no fields.
pub struct Big5Encoder;
|
||||
|
||||
impl Big5Encoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(&self,
|
||||
u16_length: usize)
|
||||
-> Option<usize> {
|
||||
// Astral: 2 to 2
|
||||
// ASCII: 1 to 1
|
||||
// Other: 1 to 2
|
||||
u16_length.checked_mul(2)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(&self,
|
||||
byte_length: usize)
|
||||
-> Option<usize> {
|
||||
// Astral: 4 to 2
|
||||
// Upper BMP: 3 to 2
|
||||
// Lower BMP: 2 to 2
|
||||
// ASCII: 1 to 1
|
||||
byte_length.checked_add(1)
|
||||
}
|
||||
|
||||
ascii_compatible_encoder_functions!(
|
||||
{
|
||||
// For simplicity, unified ideographs
|
||||
// in the pointer range 11206...11212 are handled
|
||||
// as Level 1 Hanzi.
|
||||
if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else {
|
||||
let pointer = if let Some(pointer) = big5_box_encode(bmp) {
|
||||
pointer
|
||||
} else if let Some(pointer) = big5_other_encode(bmp) {
|
||||
pointer
|
||||
} else {
|
||||
return (EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
};
|
||||
let lead = pointer / 157 + 0x81;
|
||||
let remainder = pointer % 157;
|
||||
let trail = if remainder < 0x3F {
|
||||
remainder + 0x40
|
||||
} else {
|
||||
remainder + 0x62
|
||||
};
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
}
|
||||
},
|
||||
{
|
||||
if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) {
|
||||
if let Some(rebased_pointer) = big5_astral_encode(astral as u16) {
|
||||
// big5_astral_encode returns rebased pointer,
|
||||
// so adding 0x87 instead of 0x81.
|
||||
let lead = rebased_pointer / 157 + 0x87;
|
||||
let remainder = rebased_pointer % 157;
|
||||
let trail = if remainder < 0x3F {
|
||||
remainder + 0x40
|
||||
} else {
|
||||
remainder + 0x62
|
||||
};
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else {
|
||||
return (EncoderResult::Unmappable(astral), source.consumed(), handle.written());
|
||||
}
|
||||
} else {
|
||||
return (EncoderResult::Unmappable(astral), source.consumed(), handle.written());
|
||||
}
|
||||
},
|
||||
bmp,
|
||||
astral,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_two,
|
||||
check_space_two,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as Big5 via the shared `decode` test helper and
    /// compares against `expect`.
    fn decode_big5(bytes: &[u8], expect: &str) {
        decode(BIG5, bytes, expect);
    }

    /// Encodes `string` to Big5 via the shared `encode` test helper and
    /// compares against `expect`.
    fn encode_big5(string: &str, expect: &[u8]) {
        encode(BIG5, string, expect);
    }

    #[test]
    fn test_big5_decode() {
        // Empty
        decode_big5(b"", &"");

        // ASCII
        decode_big5(&[0x61u8, 0x62u8], &"\u{0061}\u{0062}");

        // Edge cases
        decode_big5(&[0x87u8, 0x40u8], &"\u{43F0}");
        decode_big5(&[0xFEu8, 0xFEu8], &"\u{79D4}");
        decode_big5(&[0xFEu8, 0xFDu8], &"\u{2910D}");
        decode_big5(&[0x88u8, 0x62u8], &"\u{00CA}\u{0304}");
        decode_big5(&[0x88u8, 0x64u8], &"\u{00CA}\u{030C}");
        decode_big5(&[0x88u8, 0x66u8], &"\u{00CA}");
        decode_big5(&[0x88u8, 0xA3u8], &"\u{00EA}\u{0304}");
        decode_big5(&[0x88u8, 0xA5u8], &"\u{00EA}\u{030C}");
        decode_big5(&[0x88u8, 0xA7u8], &"\u{00EA}");
        decode_big5(&[0x99u8, 0xD4u8], &"\u{8991}");
        decode_big5(&[0x99u8, 0xD5u8], &"\u{27967}");
        decode_big5(&[0x99u8, 0xD6u8], &"\u{8A29}");

        // Edge cases surrounded with ASCII
        decode_big5(
            &[0x61u8, 0x87u8, 0x40u8, 0x62u8],
            &"\u{0061}\u{43F0}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFEu8, 0x62u8],
            &"\u{0061}\u{79D4}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFDu8, 0x62u8],
            &"\u{0061}\u{2910D}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x62u8, 0x62u8],
            &"\u{0061}\u{00CA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x64u8, 0x62u8],
            &"\u{0061}\u{00CA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x66u8, 0x62u8],
            &"\u{0061}\u{00CA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA3u8, 0x62u8],
            &"\u{0061}\u{00EA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA5u8, 0x62u8],
            &"\u{0061}\u{00EA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA7u8, 0x62u8],
            &"\u{0061}\u{00EA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD4u8, 0x62u8],
            &"\u{0061}\u{8991}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD5u8, 0x62u8],
            &"\u{0061}\u{27967}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD6u8, 0x62u8],
            &"\u{0061}\u{8A29}\u{0062}",
        );

        // Bad sequences
        decode_big5(&[0x80u8, 0x61u8], &"\u{FFFD}\u{0061}");
        decode_big5(&[0xFFu8, 0x61u8], &"\u{FFFD}\u{0061}");
        decode_big5(&[0xFEu8, 0x39u8], &"\u{FFFD}\u{0039}");
        decode_big5(&[0x87u8, 0x66u8], &"\u{FFFD}\u{0066}");
        decode_big5(&[0x81u8, 0x40u8], &"\u{FFFD}\u{0040}");
        decode_big5(&[0x61u8, 0x81u8], &"\u{0061}\u{FFFD}");
    }

    #[test]
    fn test_big5_encode() {
        // Empty
        encode_big5("", b"");

        // ASCII
        encode_big5("\u{0061}\u{0062}", b"\x61\x62");

        // Edge cases
        // Unmappable code points are expected as decimal numeric character
        // references. The expectation has to be spelled in ASCII because a
        // byte string literal cannot contain non-ASCII characters.
        encode_big5("\u{9EA6}\u{0061}", b"&#40614;\x61");
        encode_big5("\u{2626B}\u{0061}", b"&#156267;\x61");
        encode_big5("\u{3000}", b"\xA1\x40");
        encode_big5("\u{20AC}", b"\xA3\xE1");
        encode_big5("\u{4E00}", b"\xA4\x40");
        encode_big5("\u{27607}", b"\xC8\xA4");
        encode_big5("\u{FFE2}", b"\xC8\xCD");
        encode_big5("\u{79D4}", b"\xFE\xFE");

        // Not in index
        encode_big5("\u{2603}\u{0061}", b"&#9731;\x61");

        // duplicate low bits
        encode_big5("\u{203B5}", b"\xFD\x6A");
        encode_big5("\u{25605}", b"\xFE\x46");

        // prefer last
        encode_big5("\u{2550}", b"\xF9\xF9");
    }

    #[test]
    fn test_big5_decode_all() {
        let input = include_bytes!("test_data/big5_in.txt");
        let expectation = include_str!("test_data/big5_in_ref.txt");
        let (cow, had_errors) = BIG5.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    fn test_big5_encode_all() {
        let input = include_str!("test_data/big5_out.txt");
        let expectation = include_bytes!("test_data/big5_out_ref.txt");
        let (cow, encoding, had_errors) = BIG5.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, BIG5);
        assert_eq!(&cow[..], &expectation[..]);
    }
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,416 @@
|
|||
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use handles::*;
|
||||
use data::*;
|
||||
use variant::*;
|
||||
use super::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range16;
|
||||
|
||||
/// What the EUC-JP decoder has pending. `count()` maps each variant to a
/// number (0, 1 or 2), presumably the count of bytes already consumed for an
/// unfinished sequence -- confirm against the decoder macro.
enum EucJpPending {
|
||||
// Nothing pending; `is_none()` is true only for this variant.
None,
|
||||
// Carries one `u8`; by its name, a JIS X 0208 lead (presumably already
// rebased by the decoder -- confirm).
Jis0208Lead(u8),
|
||||
// No payload; presumably the byte that introduces a JIS X 0212 sequence
// (0x8F in the tests) has been seen -- confirm.
Jis0212Shift,
|
||||
// Carries one `u8`; the only variant for which `count()` is 2.
Jis0212Lead(u8),
|
||||
// No payload; presumably the byte that introduces half-width Katakana
// (0x8E in the tests) has been seen -- confirm.
HalfWidthKatakana,
|
||||
}
|
||||
|
||||
impl EucJpPending {
|
||||
fn is_none(&self) -> bool {
|
||||
match *self {
|
||||
EucJpPending::None => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn count(&self) -> usize {
|
||||
match *self {
|
||||
EucJpPending::None => 0,
|
||||
EucJpPending::Jis0208Lead(_) |
|
||||
EucJpPending::Jis0212Shift |
|
||||
EucJpPending::HalfWidthKatakana => 1,
|
||||
EucJpPending::Jis0212Lead(_) => 2,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// State held by the EUC-JP decoder.
pub struct EucJpDecoder {
|
||||
// Pending multi-byte state. `plus_one_if_lead` adds one to the input
// length whenever this is anything other than `EucJpPending::None`.
pending: EucJpPending,
|
||||
}
|
||||
|
||||
impl EucJpDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::EucJp(EucJpDecoder { pending: EucJpPending::None })
|
||||
}
|
||||
|
||||
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(if self.pending.is_none() { 0 } else { 1 })
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
self.plus_one_if_lead(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// worst case: 2 to 3
|
||||
let len = self.plus_one_if_lead(byte_length);
|
||||
checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_mul(3, self.plus_one_if_lead(byte_length))
|
||||
}
|
||||
|
||||
euc_jp_decoder_functions!(
|
||||
{
|
||||
let trail_minus_offset = byte.wrapping_sub(0xA1);
|
||||
// Fast-track Hiragana (60% according to Lunde)
|
||||
// and Katakana (10% acconding to Lunde).
|
||||
if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
|
||||
// Hiragana
|
||||
handle.write_upper_bmp(0x3041 + trail_minus_offset as u16)
|
||||
} else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
|
||||
// Katakana
|
||||
handle.write_upper_bmp(0x30A1 + trail_minus_offset as u16)
|
||||
} else if trail_minus_offset > (0xFE - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
} else {
|
||||
let pointer = mul_94(jis0208_lead_minus_offset) + trail_minus_offset as usize;
|
||||
let level1_pointer = pointer.wrapping_sub(1410);
|
||||
if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
|
||||
} else {
|
||||
let level2_pointer = pointer.wrapping_sub(4418);
|
||||
if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
|
||||
} else {
|
||||
let ibm_pointer = pointer.wrapping_sub(8272);
|
||||
if ibm_pointer < IBM_KANJI.len() {
|
||||
handle.write_upper_bmp(IBM_KANJI[ibm_pointer])
|
||||
} else if let Some(bmp) = jis0208_symbol_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else if let Some(bmp) = jis0208_range_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
// If lead is between 0xA1 and 0xFE, inclusive,
|
||||
// subtract 0xA1.
|
||||
let jis0212_lead_minus_offset = lead.wrapping_sub(0xA1);
|
||||
if jis0212_lead_minus_offset > (0xFE - 0xA1) {
|
||||
if lead < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_jis0212.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_jis0212.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
jis0212_lead_minus_offset
|
||||
},
|
||||
{
|
||||
// If trail is between 0xA1 and 0xFE, inclusive,
|
||||
// subtract 0xA1.
|
||||
let trail_minus_offset = byte.wrapping_sub(0xA1);
|
||||
if trail_minus_offset > (0xFE - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(3, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
let pointer = mul_94(jis0212_lead_minus_offset) + trail_minus_offset as usize;
|
||||
let pointer_minus_kanji = pointer.wrapping_sub(1410);
|
||||
if pointer_minus_kanji < JIS0212_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
|
||||
} else if let Some(bmp) = jis0212_accented_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
let pointer_minus_upper_cyrillic = pointer.wrapping_sub(597);
|
||||
if pointer_minus_upper_cyrillic <= (607 - 597) {
|
||||
handle.write_mid_bmp(0x0402 + pointer_minus_upper_cyrillic as u16)
|
||||
} else {
|
||||
let pointer_minus_lower_cyrillic = pointer.wrapping_sub(645);
|
||||
if pointer_minus_lower_cyrillic <= (655 - 645) {
|
||||
handle.write_mid_bmp(0x0452 + pointer_minus_lower_cyrillic as u16)
|
||||
} else {
|
||||
return (DecoderResult::Malformed(3, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
// If trail is between 0xA1 and 0xDF, inclusive,
|
||||
// subtract 0xA1 and map to half-width Katakana.
|
||||
let trail_minus_offset = byte.wrapping_sub(0xA1);
|
||||
if trail_minus_offset > (0xDF - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
handle.write_upper_bmp(0xFF61 + trail_minus_offset as u16)
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
jis0208_lead_minus_offset,
|
||||
byte,
|
||||
unread_handle_trail,
|
||||
jis0212_lead_minus_offset,
|
||||
lead,
|
||||
unread_handle_jis0212,
|
||||
source,
|
||||
handle
|
||||
);
|
||||
}
|
||||
|
||||
/// EUC-JP encoder. A unit struct: it has no fields.
pub struct EucJpEncoder;
|
||||
|
||||
impl EucJpEncoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::EucJp(EucJpEncoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(&self,
|
||||
u16_length: usize)
|
||||
-> Option<usize> {
|
||||
u16_length.checked_mul(2)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(&self,
|
||||
byte_length: usize)
|
||||
-> Option<usize> {
|
||||
byte_length.checked_add(1)
|
||||
}
|
||||
|
||||
ascii_compatible_bmp_encoder_functions!(
|
||||
{
|
||||
// Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
|
||||
let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
|
||||
if bmp_minus_hiragana < 0x53 {
|
||||
handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
|
||||
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
|
||||
if 0x4EDD == bmp {
|
||||
// Ideograph on the symbol row!
|
||||
handle.write_two(0xA1, 0xB8)
|
||||
} else if let Some((lead, trail)) = jis0208_level1_kanji_euc_jp_encode(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
let lead = (pos / 94) + 0xD0;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
|
||||
let lead = (pos / 94) + 0xF9;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else {
|
||||
return (EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
} else {
|
||||
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
|
||||
if bmp_minus_katakana < 0x56 {
|
||||
handle.write_two(0xA5, 0xA1 + bmp_minus_katakana as u8)
|
||||
} else {
|
||||
let bmp_minus_space = bmp.wrapping_sub(0x3000);
|
||||
if bmp_minus_space < 3 {
|
||||
// fast-track common punctuation
|
||||
handle.write_two(0xA1, 0xA1 + bmp_minus_space as u8)
|
||||
} else if bmp == 0xA5 {
|
||||
handle.write_one(0x5Cu8)
|
||||
} else if bmp == 0x203E {
|
||||
handle.write_one(0x7Eu8)
|
||||
} else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
|
||||
handle.write_two(0x8Eu8, (bmp - (0xFF61 - 0xA1)) as u8)
|
||||
} else if bmp == 0x2212 {
|
||||
handle.write_two(0xA1u8, 0xDDu8)
|
||||
} else if let Some(pointer) = jis0208_range_encode(bmp) {
|
||||
let lead = (pointer / 94) + 0xA1;
|
||||
let trail = (pointer % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 ||
|
||||
bmp == 0xF9DC {
|
||||
// Guaranteed to be found in IBM_KANJI
|
||||
let pos = position(&IBM_KANJI[..], bmp).unwrap();
|
||||
let lead = (pos / 94) + 0xF9;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else if let Some(pointer) = ibm_symbol_encode(bmp) {
|
||||
let lead = (pointer / 94) + 0xA1;
|
||||
let trail = (pointer % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else if let Some(pointer) = jis0208_symbol_encode(bmp) {
|
||||
let lead = (pointer / 94) + 0xA1;
|
||||
let trail = (pointer % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else {
|
||||
return (EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
bmp,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_two,
|
||||
check_space_two,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as EUC-JP via the shared `decode` test helper and
    /// compares against `expect`.
    fn decode_euc_jp(bytes: &[u8], expect: &str) {
        decode(EUC_JP, bytes, expect);
    }

    /// Encodes `string` to EUC-JP via the shared `encode` test helper and
    /// compares against `expect`.
    fn encode_euc_jp(string: &str, expect: &[u8]) {
        encode(EUC_JP, string, expect);
    }

    #[test]
    fn test_euc_jp_decode() {
        // Empty
        decode_euc_jp(b"", &"");

        // ASCII
        decode_euc_jp(b"\x61\x62", "\u{0061}\u{0062}");

        // Half-width
        decode_euc_jp(b"\x8E\xA1", "\u{FF61}");
        decode_euc_jp(b"\x8E\xDF", "\u{FF9F}");
        decode_euc_jp(b"\x8E\xA0", "\u{FFFD}");
        decode_euc_jp(b"\x8E\xE0", "\u{FFFD}");
        decode_euc_jp(b"\x8E\xFF", "\u{FFFD}");
        decode_euc_jp(b"\x8E", "\u{FFFD}");

        // JIS 0212
        decode_euc_jp(b"\x8F\xA1\xA1", "\u{FFFD}");
        decode_euc_jp(b"\x8F\xA2\xAF", "\u{02D8}");
        decode_euc_jp(b"\x8F\xA2\xFF", "\u{FFFD}");
        decode_euc_jp(b"\x8F\xA1", "\u{FFFD}");
        decode_euc_jp(b"\x8F", "\u{FFFD}");

        // JIS 0208
        decode_euc_jp(b"\xA1\xA1", "\u{3000}");
        decode_euc_jp(b"\xA1\xA0", "\u{FFFD}");
        decode_euc_jp(b"\xFC\xFE", "\u{FF02}");
        decode_euc_jp(b"\xFE\xFE", "\u{FFFD}");
        decode_euc_jp(b"\xA1", "\u{FFFD}");

        // Bad leads
        decode_euc_jp(b"\xFF\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\xA0\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x80\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x81\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x82\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x83\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x84\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x85\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x86\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x87\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x88\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x89\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x8A\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x8B\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x8C\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x8D\xA1\xA1", "\u{FFFD}\u{3000}");

        // Bad ASCII trail
        decode_euc_jp(b"\xA1\x40", "\u{FFFD}\u{0040}");
    }

    #[test]
    fn test_euc_jp_encode() {
        // Empty
        encode_euc_jp("", b"");

        // ASCII
        encode_euc_jp("\u{0061}\u{0062}", b"\x61\x62");

        // Exceptional code points
        encode_euc_jp("\u{00A5}", b"\x5C");
        encode_euc_jp("\u{203E}", b"\x7E");
        encode_euc_jp("\u{2212}", b"\xA1\xDD");

        // Half-width
        encode_euc_jp("\u{FF61}", b"\x8E\xA1");
        encode_euc_jp("\u{FF9F}", b"\x8E\xDF");

        // JIS 0212
        // Not encodable: the expectation is the decimal numeric character
        // reference, spelled in ASCII because a byte string literal cannot
        // contain non-ASCII characters.
        encode_euc_jp("\u{02D8}", b"&#728;");

        // JIS 0208
        encode_euc_jp("\u{3000}", b"\xA1\xA1");
        encode_euc_jp("\u{FF02}", b"\xFC\xFE");
    }

    #[test]
    fn test_jis0208_decode_all() {
        let input = include_bytes!("test_data/jis0208_in.txt");
        let expectation = include_str!("test_data/jis0208_in_ref.txt");
        let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    fn test_jis0208_encode_all() {
        let input = include_str!("test_data/jis0208_out.txt");
        let expectation = include_bytes!("test_data/jis0208_out_ref.txt");
        let (cow, encoding, had_errors) = EUC_JP.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, EUC_JP);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    fn test_jis0212_decode_all() {
        let input = include_bytes!("test_data/jis0212_in.txt");
        let expectation = include_str!("test_data/jis0212_in_ref.txt");
        let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }
}
|
|
@ -0,0 +1,382 @@
|
|||
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use handles::*;
|
||||
use data::*;
|
||||
use variant::*;
|
||||
use super::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_range16;
|
||||
use super::in_inclusive_range16;
|
||||
|
||||
/// State held by the EUC-KR decoder.
pub struct EucKrDecoder {
|
||||
// Pending lead byte, if any. `plus_one_if_lead` counts it as one extra
// input byte when this is `Some`. Presumably it is a lead byte seen at the
// end of earlier input -- confirm against the decoder macro.
lead: Option<u8>,
|
||||
}
|
||||
|
||||
impl EucKrDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::EucKr(EucKrDecoder { lead: None })
|
||||
}
|
||||
|
||||
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(
|
||||
match self.lead {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
self.plus_one_if_lead(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// worst case: 2 to 3
|
||||
let len = self.plus_one_if_lead(byte_length);
|
||||
checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_mul(3, self.plus_one_if_lead(byte_length))
|
||||
}
|
||||
|
||||
// Generates the decode functions for this decoder. The macro (defined
// elsewhere in the crate) receives two code blocks followed by the
// identifiers those blocks use:
//
// * block 1 validates a non-ASCII lead byte and yields it minus 0x81;
// * block 2 combines the stored lead with the trail `byte` and either
//   writes one BMP code unit through `handle` or returns a `Malformed`
//   result.
ascii_compatible_two_byte_decoder_functions!(
    {
        // If lead is between 0x81 and 0xFE, inclusive,
        // subtract offset 0x81.
        let non_ascii_minus_offset =
            non_ascii.wrapping_sub(0x81);
        if non_ascii_minus_offset > (0xFE - 0x81) {
            return (DecoderResult::Malformed(1, 0),
                    source.consumed(),
                    handle.written());
        }
        non_ascii_minus_offset
    },
    {
        if lead_minus_offset >= 0x20 {
            // Not the extension range above KS X 1001
            let trail_minus_offset =
                byte.wrapping_sub(0xA1);
            if trail_minus_offset <= (0xFE - 0xA1) {
                // KS X 1001
                let ksx_pointer = mul_94(lead_minus_offset - 0x20) + trail_minus_offset as usize;
                // The wrapping subtractions below turn "pointer is below the
                // start of the table" into a huge value, so a single `<`
                // comparison covers both bounds.
                let hangul_pointer = ksx_pointer.wrapping_sub((0x2F - 0x20) * 94);
                if hangul_pointer < KSX1001_HANGUL.len() {
                    let upper_bmp = KSX1001_HANGUL[hangul_pointer];
                    handle.write_upper_bmp(upper_bmp)
                } else if ksx_pointer < KSX1001_SYMBOLS.len() {
                    let bmp = KSX1001_SYMBOLS[ksx_pointer];
                    handle.write_bmp_excl_ascii(bmp)
                } else {
                    let hanja_pointer = ksx_pointer.wrapping_sub((0x49 - 0x20) * 94);
                    if hanja_pointer < KSX1001_HANJA.len() {
                        let upper_bmp = KSX1001_HANJA[hanja_pointer];
                        handle.write_upper_bmp(upper_bmp)
                    } else if (lead_minus_offset == 0x27) && ((trail_minus_offset as usize) < KSX1001_UPPERCASE.len()) {
                        let mid_bmp = KSX1001_UPPERCASE[trail_minus_offset as usize];
                        // A zero entry in the table is treated as unassigned.
                        if mid_bmp == 0 {
                            return (DecoderResult::Malformed(2, 0),
                                    unread_handle_trail.consumed(),
                                    handle.written());
                        }
                        handle.write_mid_bmp(mid_bmp)
                    } else if (lead_minus_offset == 0x28) && ((trail_minus_offset as usize) < KSX1001_LOWERCASE.len()) {
                        let mid_bmp = KSX1001_LOWERCASE[trail_minus_offset as usize];
                        handle.write_mid_bmp(mid_bmp)
                    } else if (lead_minus_offset == 0x25) && ((trail_minus_offset as usize) < KSX1001_BOX.len()) {
                        let upper_bmp = KSX1001_BOX[trail_minus_offset as usize];
                        handle.write_upper_bmp(upper_bmp)
                    } else {
                        let other_pointer = ksx_pointer.wrapping_sub(2 * 94);
                        if other_pointer < 0x039F {
                            let bmp = ksx1001_other_decode(other_pointer as u16);
                            // ASCII range means unassigned
                            if bmp < 0x80 {
                                return (DecoderResult::Malformed(2, 0),
                                        unread_handle_trail.consumed(),
                                        handle.written());
                            }
                            handle.write_bmp_excl_ascii(bmp)
                        } else {
                            return (DecoderResult::Malformed(2, 0),
                                    unread_handle_trail.consumed(),
                                    handle.written());
                        }
                    }
                }
            } else {
                // Extension range to the left of
                // KS X 1001
                let left_lead = lead_minus_offset - 0x20;
                let left_trail = if byte.wrapping_sub(0x40 + 0x41) < (0x60 - 0x40) {
                    byte - (12 + 0x41)
                } else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
                    byte - (6 + 0x41)
                } else if byte.wrapping_sub(0x41) < 0x1A {
                    byte - 0x41
                } else {
                    // An ASCII trail is unread so that it gets decoded on
                    // its own; only the lead is reported as malformed.
                    if byte < 0x80 {
                        return (DecoderResult::Malformed(1, 0),
                                unread_handle_trail.unread(),
                                handle.written());
                    }
                    return (DecoderResult::Malformed(2, 0),
                            unread_handle_trail.consumed(),
                            handle.written());
                };
                let left_pointer = ((left_lead as usize) * (190 - 94 - 12)) + left_trail as usize;
                if left_pointer < (0x45 - 0x20) * (190 - 94 - 12) + 0x12 {
                    let upper_bmp = cp949_left_hangul_decode(left_pointer as u16);
                    handle.write_upper_bmp(upper_bmp)
                } else {
                    if byte < 0x80 {
                        return (DecoderResult::Malformed(1, 0),
                                unread_handle_trail.unread(),
                                handle.written());
                    }
                    return (DecoderResult::Malformed(2, 0),
                            unread_handle_trail.consumed(),
                            handle.written());
                }
            }
        } else {
            // Extension range above KS X 1001
            let top_trail = if byte.wrapping_sub(0x40 + 0x41) < (0xBE - 0x40) {
                byte - (12 + 0x41)
            } else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
                byte - (6 + 0x41)
            } else if byte.wrapping_sub(0x41) < 0x1A {
                byte - 0x41
            } else {
                if byte < 0x80 {
                    return (DecoderResult::Malformed(1, 0),
                            unread_handle_trail.unread(),
                            handle.written());
                }
                return (DecoderResult::Malformed(2, 0),
                        unread_handle_trail.consumed(),
                        handle.written());
            };
            let top_pointer = ((lead_minus_offset as usize) * (190 - 12)) + top_trail as usize;
            let upper_bmp = cp949_top_hangul_decode(top_pointer as u16);
            handle.write_upper_bmp(upper_bmp)
        }
    },
    self,
    non_ascii,
    byte,
    lead_minus_offset,
    unread_handle_trail,
    source,
    handle,
    'outermost,
    copy_ascii_from_check_space_bmp,
    check_space_bmp,
    true);
|
||||
}
|
||||
|
||||
fn ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)> {
|
||||
if in_inclusive_range16(bmp, 0x3000, 0x3015) {
|
||||
if let Some(pos) = position(&KSX1001_SYMBOLS[..(0xAB - 0x60)], bmp) {
|
||||
return Some((0xA1, pos + 0xA1));
|
||||
}
|
||||
}
|
||||
if let Some(other_pointer) = ksx1001_other_encode(bmp) {
|
||||
let other_lead = ((other_pointer as usize) / 94) + (0x81 + 0x22);
|
||||
let other_trail = ((other_pointer as usize) % 94) + 0xA1;
|
||||
return Some((other_lead, other_trail));
|
||||
}
|
||||
if in_range16(bmp, 0x00AA, 0x0168) {
|
||||
// Latin
|
||||
if let Some(pos) = position(&KSX1001_LOWERCASE[..], bmp) {
|
||||
return Some((0x81 + 0x28, 0xA1 + pos));
|
||||
}
|
||||
if let Some(pos) = position(&KSX1001_UPPERCASE[..], bmp) {
|
||||
return Some((0x81 + 0x27, 0xA1 + pos));
|
||||
}
|
||||
} else if in_range16(bmp, 0x2500, 0x254C) {
|
||||
if let Some(pos) = position(&KSX1001_BOX[..], bmp) {
|
||||
return Some((0x81 + 0x25, 0xA1 + pos));
|
||||
}
|
||||
}
|
||||
if in_inclusive_range16(bmp, 0x2015, 0x266D) || in_inclusive_range16(bmp, 0x321C, 0x33D8) ||
|
||||
in_inclusive_range16(bmp, 0xFF3C, 0xFFE5) ||
|
||||
in_inclusive_range16(bmp, 0x00A1, 0x00F7) ||
|
||||
in_inclusive_range16(bmp, 0x02C7, 0x02DD) {
|
||||
if let Some(pos) = position(&KSX1001_SYMBOLS[3..], bmp) {
|
||||
if pos < (94 - 3) {
|
||||
return Some((0xA1, pos + 0xA1 + 3));
|
||||
}
|
||||
return Some((0xA2, pos - (94 - 3) + 0xA1));
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Field-less encoder type for EUC-KR; `EucKrEncoder::new` wraps it in
/// `VariantEncoder::EucKr`.
pub struct EucKrEncoder;
|
||||
|
||||
impl EucKrEncoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::EucKr(EucKrEncoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(&self,
|
||||
u16_length: usize)
|
||||
-> Option<usize> {
|
||||
u16_length.checked_mul(2)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(&self,
|
||||
byte_length: usize)
|
||||
-> Option<usize> {
|
||||
byte_length.checked_add(1)
|
||||
}
|
||||
|
||||
ascii_compatible_bmp_encoder_functions!(
|
||||
{
|
||||
let bmp_minus_hangul_start = bmp.wrapping_sub(0xAC00);
|
||||
let (lead, trail) = if bmp_minus_hangul_start < (0xD7A4 - 0xAC00) {
|
||||
// Hangul
|
||||
match KSX1001_HANGUL.binary_search(&bmp) {
|
||||
Ok(ksx_hangul_pointer) => {
|
||||
let ksx_hangul_lead = (ksx_hangul_pointer / 94) + (0x81 + 0x2F);
|
||||
let ksx_hangul_trail = (ksx_hangul_pointer % 94) + 0xA1;
|
||||
(ksx_hangul_lead, ksx_hangul_trail)
|
||||
}
|
||||
Err(_) => {
|
||||
let (lead, cp949_trail) = if bmp < 0xC8A5 {
|
||||
// Above KS X 1001
|
||||
let top_pointer = cp949_top_hangul_encode(bmp) as usize;
|
||||
let top_lead = (top_pointer / (190 - 12)) + 0x81;
|
||||
let top_trail = top_pointer % (190 - 12);
|
||||
(top_lead, top_trail)
|
||||
} else {
|
||||
// To the left of KS X 1001
|
||||
let left_pointer = cp949_left_hangul_encode(bmp) as usize;
|
||||
let left_lead = (left_pointer / (190 - 94 - 12)) + (0x81 + 0x20);
|
||||
let left_trail = left_pointer % (190 - 94 - 12);
|
||||
(left_lead, left_trail)
|
||||
};
|
||||
let offset = if cp949_trail >= (0x40 - 12) {
|
||||
0x41 + 12
|
||||
} else if cp949_trail >= (0x20 - 6) {
|
||||
0x41 + 6
|
||||
} else {
|
||||
0x41
|
||||
};
|
||||
(lead, cp949_trail + offset)
|
||||
}
|
||||
}
|
||||
} else if in_range16(bmp, 0x33DE, 0xFF01) {
|
||||
// Vast range that includes no other
|
||||
// mappables except Hangul (already
|
||||
// processed) and Hanja.
|
||||
// Narrow the range further to Unified and
|
||||
// Compatibility ranges of Hanja.
|
||||
if in_range16(bmp, 0x4E00, 0x9F9D) || in_range16(bmp, 0xF900, 0xFA0C) {
|
||||
if let Some(hanja_pointer) = position(&KSX1001_HANJA[..], bmp) {
|
||||
let hanja_lead = (hanja_pointer / 94) + (0x81 + 0x49);
|
||||
let hanja_trail = (hanja_pointer % 94) + 0xA1;
|
||||
(hanja_lead, hanja_trail)
|
||||
} else {
|
||||
return (EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
} else {
|
||||
return (EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
} else if let Some((lead, trail)) = ksx1001_encode_misc(bmp) {
|
||||
(lead, trail)
|
||||
} else {
|
||||
return (EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
};
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
},
|
||||
bmp,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_two,
|
||||
check_space_two,
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Asserts, via the shared `decode` test helper, that `bytes` decodes
    /// to `expect` as EUC-KR.
    fn decode_euc_kr(bytes: &[u8], expect: &str) {
        decode(EUC_KR, bytes, expect);
    }

    /// Asserts, via the shared `encode` test helper, that `string` encodes
    /// to `expect` as EUC-KR.
    fn encode_euc_kr(string: &str, expect: &[u8]) {
        encode(EUC_KR, string, expect);
    }

    #[test]
    fn test_euc_kr_decode() {
        // Empty
        decode_euc_kr(b"", "");

        // ASCII
        decode_euc_kr(b"\x61\x62", "\u{0061}\u{0062}");

        // Two-byte sequences, including malformed leads and trails
        decode_euc_kr(b"\x81\x41", "\u{AC02}");
        decode_euc_kr(b"\x81\x5B", "\u{FFFD}\x5B");
        decode_euc_kr(b"\xFD\xFE", "\u{8A70}");
        decode_euc_kr(b"\xFE\x41", "\u{FFFD}\x41");
        decode_euc_kr(b"\xFF\x41", "\u{FFFD}\x41");
        decode_euc_kr(b"\x80\x41", "\u{FFFD}\x41");
        decode_euc_kr(b"\xA1\xFF", "\u{FFFD}");
        decode_euc_kr(b"\x81\xFF", "\u{FFFD}");
    }

    #[test]
    fn test_euc_kr_encode() {
        // Empty
        encode_euc_kr("", b"");

        // ASCII
        encode_euc_kr("\u{0061}\u{0062}", b"\x61\x62");

        encode_euc_kr("\u{AC02}", b"\x81\x41");
        encode_euc_kr("\u{8A70}", b"\xFD\xFE");
    }

    #[test]
    fn test_euc_kr_decode_all() {
        let input = include_bytes!("test_data/euc_kr_in.txt");
        let expectation = include_str!("test_data/euc_kr_in_ref.txt");
        let (cow, had_errors) = EUC_KR.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    fn test_euc_kr_encode_all() {
        let input = include_str!("test_data/euc_kr_out.txt");
        let expectation = include_bytes!("test_data/euc_kr_out_ref.txt");
        let (cow, encoding, had_errors) = EUC_KR.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, EUC_KR);
        assert_eq!(&cow[..], &expectation[..]);
    }
}
|
|
@ -0,0 +1,721 @@
|
|||
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use handles::*;
|
||||
use data::*;
|
||||
use variant::*;
|
||||
use super::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range16;
|
||||
use super::in_range16;
|
||||
|
||||
/// Zero to three bytes held back by the gb18030 decoder.
///
/// NOTE(review): presumably these are bytes of an incomplete sequence that
/// must be reprocessed on the next call -- confirm against the decoder
/// macro, which is not part of this file.
enum Gb18030Pending {
    None,
    One(u8),
    Two(u8, u8),
    Three(u8, u8, u8),
}

impl Gb18030Pending {
    /// Returns `true` only for the `None` variant.
    fn is_none(&self) -> bool {
        match *self {
            Gb18030Pending::None => true,
            _ => false,
        }
    }

    /// Number of bytes stored in this value (0 through 3).
    fn count(&self) -> usize {
        match *self {
            Gb18030Pending::None => 0,
            Gb18030Pending::One(_) => 1,
            Gb18030Pending::Two(_, _) => 2,
            Gb18030Pending::Three(_, _, _) => 3,
        }
    }
}
|
||||
|
||||
pub struct Gb18030Decoder {
|
||||
first: Option<u8>,
|
||||
second: Option<u8>,
|
||||
third: Option<u8>,
|
||||
pending: Gb18030Pending,
|
||||
pending_ascii: Option<u8>,
|
||||
}
|
||||
|
||||
impl Gb18030Decoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::Gb18030(
|
||||
Gb18030Decoder {
|
||||
first: None,
|
||||
second: None,
|
||||
third: None,
|
||||
pending: Gb18030Pending::None,
|
||||
pending_ascii: None,
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(
|
||||
self.pending.count() +
|
||||
match self.first {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
} +
|
||||
match self.second {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
} +
|
||||
match self.third {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
} +
|
||||
match self.pending_ascii {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
// ASCII: 1 to 1 (worst case)
|
||||
// gbk: 2 to 1
|
||||
// ranges: 4 to 1 or 4 to 2
|
||||
checked_add(1, self.extra_from_state(byte_length))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// ASCII: 1 to 1
|
||||
// gbk: 2 to 2 or 2 to 3
|
||||
// ranges: 4 to 2, 4 to 3 or 4 to 4
|
||||
// 0x80: 1 to 3 (worst case)
|
||||
self.max_utf8_buffer_length(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(1, checked_mul(3, self.extra_from_state(byte_length)))
|
||||
}
|
||||
|
||||
gb18030_decoder_functions!(
|
||||
{
|
||||
// If first is between 0x81 and 0xFE, inclusive,
|
||||
// subtract offset 0x81.
|
||||
let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81);
|
||||
if non_ascii_minus_offset > (0xFE - 0x81) {
|
||||
if non_ascii == 0x80 {
|
||||
handle.write_upper_bmp(0x20ACu16);
|
||||
continue 'outermost;
|
||||
}
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
non_ascii_minus_offset
|
||||
},
|
||||
{
|
||||
// Two-byte (or error)
|
||||
if first_minus_offset >= 0x20 {
|
||||
// Not the gbk ideograph range above GB2312
|
||||
let trail_minus_offset = second.wrapping_sub(0xA1);
|
||||
if trail_minus_offset <= (0xFE - 0xA1) {
|
||||
// GB2312
|
||||
let hanzi_lead = first_minus_offset.wrapping_sub(0x2F);
|
||||
if hanzi_lead < (0x77 - 0x2F) {
|
||||
// Level 1 Hanzi, Level 2 Hanzi
|
||||
// or one of the 5 PUA code
|
||||
// points in between.
|
||||
let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
|
||||
let upper_bmp = GB2312_HANZI[hanzi_pointer];
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else if first_minus_offset == 0x20 {
|
||||
// Symbols (starting with ideographic space)
|
||||
let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
|
||||
handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize])
|
||||
} else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
|
||||
handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
|
||||
} else if first_minus_offset > 0x76 {
|
||||
// Bottom PUA
|
||||
let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16;
|
||||
handle.write_upper_bmp(pua)
|
||||
} else {
|
||||
let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16);
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
}
|
||||
} else {
|
||||
// gbk range on the left
|
||||
let mut trail_minus_offset = second.wrapping_sub(0x40);
|
||||
if trail_minus_offset > (0x7E - 0x40) {
|
||||
let trail_minus_range_start = second.wrapping_sub(0x80);
|
||||
if trail_minus_range_start > (0xA0 - 0x80) {
|
||||
if second < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_second.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_second.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
trail_minus_offset = second - 0x41;
|
||||
}
|
||||
// Zero-base lead
|
||||
let left_lead = first_minus_offset - 0x20;
|
||||
let left_pointer = left_lead as usize * (190 - 94) +
|
||||
trail_minus_offset as usize;
|
||||
let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94));
|
||||
if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) {
|
||||
let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else if left_pointer < ((0x29 - 0x20) * (190 - 94)) {
|
||||
let bmp = gbk_other_decode(left_pointer as u16);
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5);
|
||||
let upper_bmp = GBK_BOTTOM[bottom_pointer];
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// gbk ideograph range above GB2312
|
||||
let mut trail_minus_offset = second.wrapping_sub(0x40);
|
||||
if trail_minus_offset > (0x7E - 0x40) {
|
||||
let trail_minus_range_start = second.wrapping_sub(0x80);
|
||||
if trail_minus_range_start > (0xFE - 0x80) {
|
||||
if second < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_second.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_second.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
trail_minus_offset = second - 0x41;
|
||||
}
|
||||
let pointer = first_minus_offset as usize * 190usize +
|
||||
trail_minus_offset as usize;
|
||||
let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
}
|
||||
},
|
||||
{
|
||||
// If third is between 0x81 and 0xFE, inclusive,
|
||||
// subtract offset 0x81.
|
||||
let third_minus_offset = third.wrapping_sub(0x81);
|
||||
if third_minus_offset > (0xFE - 0x81) {
|
||||
// We have an error. Let's inline what's going
|
||||
// to happen when `second` is
|
||||
// reprocessed. (`third` gets unread.)
|
||||
// `second` is guaranteed ASCII, so let's
|
||||
// put it in `pending_ascii`. Recompute
|
||||
// `second` from `second_minus_offset`.
|
||||
self.pending_ascii = Some(second_minus_offset + 0x30);
|
||||
// Now unread `third` and designate the previous
|
||||
// `first` as being in error.
|
||||
return (DecoderResult::Malformed(1, 1),
|
||||
unread_handle_third.unread(),
|
||||
handle.written());
|
||||
}
|
||||
third_minus_offset
|
||||
},
|
||||
{
|
||||
// If fourth is between 0x30 and 0x39, inclusive,
|
||||
// subtract offset 0x30.
|
||||
//
|
||||
// If we have an error, we'll inline what's going
|
||||
// to happen when `second` and `third` are
|
||||
// reprocessed. (`fourth` gets unread.)
|
||||
// `second` is guaranteed ASCII, so let's
|
||||
// put it in `pending_ascii`. Recompute
|
||||
// `second` from `second_minus_offset` to
|
||||
// make this block reusable when `second`
|
||||
// is not in scope.
|
||||
//
|
||||
// `third` is guaranteed to be in the range
|
||||
// that makes it become the new `self.first`.
|
||||
//
|
||||
// `fourth` gets unread and the previous
|
||||
// `first` gets designates as being in error.
|
||||
let fourth_minus_offset = fourth.wrapping_sub(0x30);
|
||||
if fourth_minus_offset > (0x39 - 0x30) {
|
||||
self.pending_ascii = Some(second_minus_offset + 0x30);
|
||||
self.pending = Gb18030Pending::One(third_minus_offset);
|
||||
return (DecoderResult::Malformed(1, 2),
|
||||
unread_handle_fourth.unread(),
|
||||
handle.written());
|
||||
}
|
||||
let pointer = (first_minus_offset as usize * (10 * 126 * 10)) +
|
||||
(second_minus_offset as usize * (10 * 126)) +
|
||||
(third_minus_offset as usize * 10) +
|
||||
fourth_minus_offset as usize;
|
||||
if pointer <= 39419 {
|
||||
// BMP
|
||||
if pointer == 7457 {
|
||||
handle.write_upper_bmp(0xE7C7)
|
||||
} else {
|
||||
handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
|
||||
}
|
||||
} else if pointer >= 189000 && pointer <= 1237575 {
|
||||
// Astral
|
||||
handle.write_astral((pointer - (189000usize - 0x10000usize)) as u32)
|
||||
} else {
|
||||
return (DecoderResult::Malformed(4, 0),
|
||||
unread_handle_fourth.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
first_minus_offset,
|
||||
second,
|
||||
second_minus_offset,
|
||||
unread_handle_second,
|
||||
third,
|
||||
third_minus_offset,
|
||||
unread_handle_third,
|
||||
fourth,
|
||||
fourth_minus_offset,
|
||||
unread_handle_fourth,
|
||||
source,
|
||||
handle,
|
||||
'outermost);
|
||||
}
|
||||
|
||||
// XXX Experiment with inline directives
|
||||
fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
|
||||
// Try ideographic punctuation first as it's the most likely case.
|
||||
// Throwing in the check for full-width currencies and tilde is probably
|
||||
// more size-efficient here than elsewhere.
|
||||
if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) {
|
||||
if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) {
|
||||
return Some((0xA1, pos + 0xA1));
|
||||
}
|
||||
}
|
||||
// Ext A
|
||||
if in_range16(bmp, 0x3400, 0x4E00) {
|
||||
return position(&GBK_BOTTOM[21..100], bmp).map(
|
||||
|pos| {
|
||||
(0xFE,
|
||||
pos +
|
||||
if pos < (0x3F - 16) {
|
||||
0x40 + 16
|
||||
} else {
|
||||
0x41 + 16
|
||||
})
|
||||
}
|
||||
);
|
||||
}
|
||||
// Compatibility ideographs
|
||||
if in_range16(bmp, 0xF900, 0xFB00) {
|
||||
return position(&GBK_BOTTOM[0..21], bmp).map(
|
||||
|pos| {
|
||||
if pos < 5 {
|
||||
// end of second to last row
|
||||
(0xFD, pos + (190 - 94 - 5 + 0x41))
|
||||
} else {
|
||||
// last row
|
||||
(0xFE, pos + (0x40 - 5))
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
// Handle everything below U+02CA, which is in GBK_OTHER.
|
||||
if bmp < 0x02CA {
|
||||
if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 {
|
||||
// Pinyin except U+1E3F
|
||||
if let Some(pos) = position(&GB2312_PINYIN[..], bmp) {
|
||||
return Some((0xA8, pos + 0xA1));
|
||||
}
|
||||
} else if in_inclusive_range16(bmp, 0x00A4, 0x00F7) ||
|
||||
in_inclusive_range16(bmp, 0x02C7, 0x02C9) {
|
||||
// Diacritics and Latin 1 symbols
|
||||
if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) {
|
||||
return Some((0xA1, pos + 0xA1 + 3));
|
||||
}
|
||||
}
|
||||
return None;
|
||||
}
|
||||
if bmp >= 0xE794 {
|
||||
// Various brackets, all in PUA or full-width regions
|
||||
if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
|
||||
return Some((0xA6, pos + (0x9F - 0x60 + 0xA1)));
|
||||
}
|
||||
} else if bmp == 0x1E3F {
|
||||
// The one Pinyin placed elsewhere on the BMP
|
||||
return Some((0xA8, 0x7B - 0x60 + 0xA1));
|
||||
} else if in_range16(bmp, 0xA000, 0xD800) {
|
||||
// Since Korean has usage in China, let's spend a branch to fast-track
|
||||
// Hangul.
|
||||
return None;
|
||||
}
|
||||
// GB2312 other (except bottom PUA and PUA between Hanzi levels).
|
||||
if let Some(other_pointer) = gb2312_other_encode(bmp) {
|
||||
let other_lead = other_pointer as usize / 94;
|
||||
let other_trail = other_pointer as usize % 94;
|
||||
return Some((0xA2 + other_lead, 0xA1 + other_trail));
|
||||
}
|
||||
// At this point, we've handled all mappable characters above U+02D9 but
|
||||
// below U+2010. Let's check for that range in order to let lower BMP
|
||||
// characters used for minority languages in China avoid the subsequent
|
||||
// search that deals mainly with various symbols.
|
||||
if in_range16(bmp, 0x02DA, 0x2010) {
|
||||
return None;
|
||||
}
|
||||
// GBK other (except radicals and PUA in GBK_BOTTOM).
|
||||
if let Some(other_pointer) = gbk_other_encode(bmp) {
|
||||
let other_lead = other_pointer as usize / (190 - 94);
|
||||
let other_trail = other_pointer as usize % (190 - 94);
|
||||
let offset = if other_trail < 0x3F { 0x40 } else { 0x41 };
|
||||
return Some((other_lead + (0x81 + 0x20), other_trail + offset));
|
||||
}
|
||||
// CJK Radicals Supplement or PUA in GBK_BOTTOM
|
||||
if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) {
|
||||
if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) {
|
||||
let trail = pos + 16;
|
||||
let offset = if trail < 0x3F { 0x40 } else { 0x41 };
|
||||
return Some((0xFE, trail + offset));
|
||||
}
|
||||
}
|
||||
// GB2312 bottom PUA
|
||||
let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234);
|
||||
if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) {
|
||||
let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94;
|
||||
let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94;
|
||||
return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail));
|
||||
}
|
||||
// PUA between Hanzi Levels
|
||||
let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810);
|
||||
if bmp_minus_pua_between_hanzi < 5 {
|
||||
return Some(
|
||||
(0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize),
|
||||
);
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Encoder shared by gb18030 and GBK.
pub struct Gb18030Encoder {
    // `true` enables four-byte output. When `false`, code points without a
    // two-byte form are reported as unmappable and U+20AC is written as the
    // single byte 0x80.
    extended: bool,
}
|
||||
|
||||
impl Gb18030Encoder {
|
||||
pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder {
|
||||
Encoder::new(
|
||||
encoding,
|
||||
VariantEncoder::Gb18030(Gb18030Encoder { extended: extended_range }),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(&self,
|
||||
u16_length: usize)
|
||||
-> Option<usize> {
|
||||
if self.extended {
|
||||
u16_length.checked_mul(4)
|
||||
} else {
|
||||
// Need to add, because space check is done with the four-byte
|
||||
// assumption.
|
||||
checked_add(2, u16_length.checked_mul(2))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(&self,
|
||||
byte_length: usize)
|
||||
-> Option<usize> {
|
||||
if self.extended {
|
||||
// 1 to 1
|
||||
// 2 to 2
|
||||
// 3 to 2
|
||||
// 2 to 4 (worst)
|
||||
// 3 to 4
|
||||
// 4 to 4
|
||||
checked_add(2, byte_length.checked_mul(2))
|
||||
} else {
|
||||
// 1 to 1
|
||||
// 2 to 2
|
||||
// 3 to 2
|
||||
// Need to add, because space check is done with the four-byte
|
||||
// assumption.
|
||||
byte_length.checked_add(3)
|
||||
}
|
||||
}
|
||||
|
||||
ascii_compatible_encoder_functions!(
|
||||
{
|
||||
let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00);
|
||||
if bmp_minus_unified_start < (0x9FA6 - 0x4E00) {
|
||||
// CJK Unified Ideographs
|
||||
// Can't fail now, since all are
|
||||
// mapped.
|
||||
// XXX Can we do something smarter
|
||||
// than linear search for GB2312
|
||||
// Level 2 Hanzi, which are almost
|
||||
// Unicode-ordered?
|
||||
if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) {
|
||||
let hanzi_lead = (hanzi_pointer / 94) + (0xD8);
|
||||
let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
|
||||
handle.write_two(hanzi_lead as u8, hanzi_trail as u8)
|
||||
} else {
|
||||
let (lead, gbk_trail) = if bmp < 0x72DC {
|
||||
// Above GB2312
|
||||
let pointer = gbk_top_ideograph_encode(bmp) as usize;
|
||||
let lead = (pointer / 190) + 0x81;
|
||||
let gbk_trail = pointer % 190;
|
||||
(lead, gbk_trail)
|
||||
} else {
|
||||
// To the left of GB2312
|
||||
let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize;
|
||||
let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29);
|
||||
let gbk_trail = gbk_left_ideograph_pointer % (190 - 94);
|
||||
(lead, gbk_trail)
|
||||
};
|
||||
let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 };
|
||||
handle.write_two(lead as u8, (gbk_trail + offset) as u8)
|
||||
}
|
||||
} else if bmp == 0xE5E5 {
|
||||
// It's not optimal to check for the unmappable
|
||||
// and for euro at this stage, but getting
|
||||
// the out of the way makes the rest of the
|
||||
// code less messy.
|
||||
return (EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
} else if bmp == 0x20AC && !self.extended {
|
||||
handle.write_one(0x80u8)
|
||||
} else {
|
||||
match gbk_encode_non_unified(bmp) {
|
||||
Some((lead, trail)) => handle.write_two(lead as u8, trail as u8),
|
||||
None => {
|
||||
if !self.extended {
|
||||
return (EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
let range_pointer = gb18030_range_encode(bmp);
|
||||
let first = range_pointer / (10 * 126 * 10);
|
||||
let rem_first = range_pointer % (10 * 126 * 10);
|
||||
let second = rem_first / (10 * 126);
|
||||
let rem_second = rem_first % (10 * 126);
|
||||
let third = rem_second / 10;
|
||||
let fourth = rem_second % 10;
|
||||
handle.write_four(
|
||||
(first + 0x81) as u8,
|
||||
(second + 0x30) as u8,
|
||||
(third + 0x81) as u8,
|
||||
(fourth + 0x30) as u8,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
if !self.extended {
|
||||
return (EncoderResult::Unmappable(astral), source.consumed(), handle.written());
|
||||
}
|
||||
let range_pointer = astral as usize + (189000usize - 0x10000usize);
|
||||
let first = range_pointer / (10 * 126 * 10);
|
||||
let rem_first = range_pointer % (10 * 126 * 10);
|
||||
let second = rem_first / (10 * 126);
|
||||
let rem_second = rem_first % (10 * 126);
|
||||
let third = rem_second / 10;
|
||||
let fourth = rem_second % 10;
|
||||
handle.write_four(
|
||||
(first + 0x81) as u8,
|
||||
(second + 0x30) as u8,
|
||||
(third + 0x81) as u8,
|
||||
(fourth + 0x30) as u8,
|
||||
)
|
||||
},
|
||||
bmp,
|
||||
astral,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_four,
|
||||
check_space_four,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::super::testing::*;
|
||||
use super::super::*;
|
||||
|
||||
fn decode_gb18030(bytes: &[u8], expect: &str) {
|
||||
decode(GB18030, bytes, expect);
|
||||
}
|
||||
|
||||
fn encode_gb18030(string: &str, expect: &[u8]) {
|
||||
encode(GB18030, string, expect);
|
||||
}
|
||||
|
||||
fn encode_gbk(string: &str, expect: &[u8]) {
|
||||
encode(GBK, string, expect);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_gb18030_decode() {
|
||||
// Empty
|
||||
decode_gb18030(b"", &"");
|
||||
|
||||
// ASCII
|
||||
decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}");
|
||||
|
||||
// euro
|
||||
decode_gb18030(b"\x80", "\u{20AC}");
|
||||
decode_gb18030(b"\xA2\xE3", "\u{20AC}");
|
||||
|
||||
// two bytes
|
||||
decode_gb18030(b"\x81\x40", "\u{4E02}");
|
||||
decode_gb18030(b"\x81\x7E", "\u{4E8A}");
|
||||
decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}");
|
||||
decode_gb18030(b"\x81\x80", "\u{4E90}");
|
||||
decode_gb18030(b"\x81\xFE", "\u{4FA2}");
|
||||
decode_gb18030(b"\xFE\x40", "\u{FA0C}");
|
||||
decode_gb18030(b"\xFE\x7E", "\u{E843}");
|
||||
decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}");
|
||||
decode_gb18030(b"\xFE\x80", "\u{4723}");
|
||||
decode_gb18030(b"\xFE\xFE", "\u{E4C5}");
|
||||
|
||||
// The difference from the original GB18030
|
||||
decode_gb18030(b"\xA3\xA0", "\u{3000}");
|
||||
decode_gb18030(b"\xA1\xA1", "\u{3000}");
|
||||
|
||||
// 0xFF
|
||||
decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}");
|
||||
|
||||
// Four bytes
|
||||
decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}");
|
||||
decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}");
|
||||
decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}");
|
||||
decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}");
|
||||
decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}");
|
||||
decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}");
|
||||
decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}");
|
||||
decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} !
|
||||
|
||||
}
|
||||
|
||||
/// Spot checks for the gb18030 encoder. Unmappable input is expected to come
/// out as a decimal numeric character reference (`&#NNNN;`), which is what
/// the shared `encode` test helper produces for unmappable characters.
#[test]
fn test_gb18030_encode() {
    // Empty
    encode_gb18030("", b"");

    // ASCII
    encode_gb18030("\u{0061}\u{0062}", b"\x61\x62");

    // euro: gb18030 emits the two-byte form, never the single byte 0x80.
    encode_gb18030("\u{20AC}", b"\xA2\xE3");

    // two bytes
    encode_gb18030("\u{4E02}", b"\x81\x40");
    encode_gb18030("\u{4E8A}", b"\x81\x7E");
    encode_gb18030("\u{4E90}", b"\x81\x80");
    encode_gb18030("\u{4FA2}", b"\x81\xFE");
    encode_gb18030("\u{FA0C}", b"\xFE\x40");
    encode_gb18030("\u{E843}", b"\xFE\x7E");
    encode_gb18030("\u{4723}", b"\xFE\x80");
    encode_gb18030("\u{E4C5}", b"\xFE\xFE");

    // The difference from the original GB18030: U+E5E5 is unmappable and is
    // replaced by its numeric character reference (58853 == 0xE5E5).
    encode_gb18030("\u{E5E5}", b"&#58853;");
    encode_gb18030("\u{3000}", b"\xA1\xA1");

    // Four bytes
    encode_gb18030("\u{0080}", b"\x81\x30\x81\x30");
    encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37");
    encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30");
    encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33");
    encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35");

    // Edge cases
    encode_gb18030("\u{00F7}", b"\xA1\xC2");
}
|
||||
|
||||
/// Spot checks for the GBK encoder. GBK shares the two-byte range with
/// gb18030 but has no four-byte form, so characters that would need four
/// bytes are expected as decimal numeric character references (`&#NNNN;`).
///
/// Byte-string literals must be ASCII-only, so the references are spelled
/// out literally rather than as the characters they denote.
#[test]
fn test_gbk_encode() {
    // Empty
    encode_gbk("", b"");

    // ASCII
    encode_gbk("\u{0061}\u{0062}", b"\x61\x62");

    // euro: GBK uses the single byte 0x80.
    encode_gbk("\u{20AC}", b"\x80");

    // two bytes
    encode_gbk("\u{4E02}", b"\x81\x40");
    encode_gbk("\u{4E8A}", b"\x81\x7E");
    encode_gbk("\u{4E90}", b"\x81\x80");
    encode_gbk("\u{4FA2}", b"\x81\xFE");
    encode_gbk("\u{FA0C}", b"\xFE\x40");
    encode_gbk("\u{E843}", b"\xFE\x7E");
    encode_gbk("\u{4723}", b"\xFE\x80");
    encode_gbk("\u{E4C5}", b"\xFE\xFE");

    // The difference from the original gb18030: U+E5E5 is unmappable
    // (58853 == 0xE5E5).
    encode_gbk("\u{E5E5}", b"&#58853;");
    encode_gbk("\u{3000}", b"\xA1\xA1");

    // Four bytes in gb18030; unmappable in GBK, so each becomes a numeric
    // character reference with the decimal scalar value.
    encode_gbk("\u{0080}", b"&#128;");
    encode_gbk("\u{E7C7}", b"&#59335;");
    encode_gbk("\u{2603}", b"&#9731;");
    encode_gbk("\u{1F4A9}", b"&#128169;");
    encode_gbk("\u{10FFFF}", b"&#1114111;");

    // Edge cases
    encode_gbk("\u{00F7}", b"\xA1\xC2");
}
|
||||
|
||||
/// Decodes the bundled gb18030 input fixture in one go and compares the
/// result with the bundled reference text; no decode errors are allowed.
#[test]
fn test_gb18030_decode_all() {
    let encoded = include_bytes!("test_data/gb18030_in.txt");
    let reference = include_str!("test_data/gb18030_in_ref.txt");

    let (decoded, saw_errors) = GB18030.decode_without_bom_handling(encoded);

    assert!(!saw_errors, "Should not have had errors.");
    assert_eq!(&decoded[..], reference);
}
|
||||
|
||||
/// Encodes the bundled text fixture as gb18030 in one go and compares the
/// bytes with the bundled reference; no unmappable characters are allowed
/// and the reported output encoding must still be gb18030.
#[test]
fn test_gb18030_encode_all() {
    let text = include_str!("test_data/gb18030_out.txt");
    let reference = include_bytes!("test_data/gb18030_out_ref.txt");

    let (encoded, used_encoding, saw_errors) = GB18030.encode(text);

    assert!(!saw_errors, "Should not have had errors.");
    assert_eq!(used_encoding, GB18030);
    assert_eq!(&encoded[..], &reference[..]);
}
|
||||
|
||||
/// Checks that the buffer size reported by
/// `max_buffer_length_from_utf16_without_replacement(1)` is enough to encode
/// the single UTF-16 unit U+3000, which must come out as the two bytes
/// 0xA1 0xA1.
#[test]
fn test_gb18030_encode_from_utf16_max_length() {
    let mut output = [0u8; 20];
    let mut encoder = GB18030.new_encoder();

    // Size the destination slice to exactly what the encoder claims it needs.
    let capacity = encoder
        .max_buffer_length_from_utf16_without_replacement(1)
        .unwrap();
    let (status, units_read, bytes_written) =
        encoder.encode_from_utf16_without_replacement(&[0x3000], &mut output[..capacity], true);

    assert_eq!(status, EncoderResult::InputEmpty);
    assert_eq!(units_read, 1);
    assert_eq!(bytes_written, 2);
    assert_eq!(output[0], 0xA1);
    assert_eq!(output[1], 0xA1);
}
|
||||
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,966 @@
|
|||
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use handles::*;
|
||||
use data::*;
|
||||
use variant::*;
|
||||
use super::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range16;
|
||||
|
||||
/// States of the ISO-2022-JP decoder state machine.
///
/// The first four variants are the "output" states that an escape sequence
/// can select; the remaining three are transient states used while a
/// multi-byte sequence or an escape sequence is being read.
#[derive(Copy,Clone)]
|
||||
enum Iso2022JpDecoderState {
|
||||
/// Initial state: bytes are written through as ASCII; 0x0E, 0x0F and
/// bytes above 0x7F are malformed.
Ascii,
|
||||
/// Like `Ascii`, except that 0x5C decodes to U+00A5 and 0x7E to U+203E.
Roman,
|
||||
/// Bytes 0x21..=0x5F decode to U+FF61 + (byte - 0x21); anything else
/// (other than the escape byte) is malformed.
Katakana,
|
||||
/// Waiting for the first byte (0x21..=0x7E) of a two-byte sequence.
LeadByte,
|
||||
/// A lead byte is stored in `lead`; waiting for the second byte.
TrailByte,
|
||||
/// The escape byte 0x1B has been seen.
EscapeStart,
|
||||
/// 0x1B followed by 0x24 or 0x28 has been seen; waiting for the byte that
/// completes (or breaks) the escape sequence.
Escape,
|
||||
}
|
||||
|
||||
/// Streaming decoder state for ISO-2022-JP.
pub struct Iso2022JpDecoder {
|
||||
// State the machine is currently in, including the transient states.
decoder_state: Iso2022JpDecoderState,
|
||||
// State selected by the last complete escape sequence; the decoder falls
// back to it when an escape sequence or two-byte sequence is abandoned.
output_state: Iso2022JpDecoderState, // only takes 1 of first 4 values
|
||||
// Either the lead byte of a two-byte sequence (set in `LeadByte`) or the
// byte that followed 0x1B (set in `EscapeStart`); zero when unused.
lead: u8,
|
||||
// Set when an escape sequence has just completed and cleared when a
// non-escape byte is processed; a second completed escape while it is
// still set is reported as malformed.
output_flag: bool,
|
||||
// True when `lead` holds the byte after a failed escape that still has to
// be processed at the start of the next decode call.
pending_prepended: bool,
|
||||
}
|
||||
|
||||
impl Iso2022JpDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::Iso2022Jp(
|
||||
Iso2022JpDecoder {
|
||||
decoder_state: Iso2022JpDecoderState::Ascii,
|
||||
output_state: Iso2022JpDecoderState::Ascii,
|
||||
lead: 0u8,
|
||||
output_flag: false,
|
||||
pending_prepended: false,
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
fn extra_to_input_from_state(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(
|
||||
if self.lead == 0 || self.pending_prepended {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
} +
|
||||
match self.decoder_state {
|
||||
Iso2022JpDecoderState::Escape |
|
||||
Iso2022JpDecoderState::EscapeStart => 1,
|
||||
_ => 0,
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
fn extra_to_output_from_state(&self) -> usize {
|
||||
if self.lead != 0 && self.pending_prepended {
|
||||
1 + self.output_flag as usize
|
||||
} else {
|
||||
self.output_flag as usize
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(
|
||||
self.extra_to_output_from_state(),
|
||||
self.extra_to_input_from_state(byte_length),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// worst case: 1 to 3 (half-width katakana)
|
||||
self.max_utf8_buffer_length(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_mul(
|
||||
3,
|
||||
checked_add(
|
||||
self.extra_to_output_from_state(),
|
||||
self.extra_to_input_from_state(byte_length),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
decoder_functions!(
|
||||
{
|
||||
if self.pending_prepended {
|
||||
// lead was set in EscapeStart and "prepended"
|
||||
// in Escape.
|
||||
debug_assert!(self.lead == 0x24u8 || self.lead == 0x28u8);
|
||||
match dest.check_space_bmp() {
|
||||
Space::Full(_) => {
|
||||
return (DecoderResult::OutputFull, 0, 0);
|
||||
}
|
||||
Space::Available(destination_handle) => {
|
||||
self.pending_prepended = false;
|
||||
self.output_flag = false;
|
||||
match self.decoder_state {
|
||||
Iso2022JpDecoderState::Ascii |
|
||||
Iso2022JpDecoderState::Roman => {
|
||||
destination_handle.write_ascii(self.lead);
|
||||
self.lead = 0x0u8;
|
||||
}
|
||||
Iso2022JpDecoderState::Katakana => {
|
||||
destination_handle
|
||||
.write_upper_bmp(self.lead as u16 - 0x21u16 + 0xFF61u16);
|
||||
self.lead = 0x0u8;
|
||||
}
|
||||
Iso2022JpDecoderState::LeadByte => {
|
||||
self.decoder_state = Iso2022JpDecoderState::TrailByte;
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{},
|
||||
{
|
||||
match self.decoder_state {
|
||||
Iso2022JpDecoderState::TrailByte |
|
||||
Iso2022JpDecoderState::EscapeStart => {
|
||||
self.decoder_state = self.output_state;
|
||||
return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
|
||||
}
|
||||
Iso2022JpDecoderState::Escape => {
|
||||
self.pending_prepended = true;
|
||||
self.decoder_state = self.output_state;
|
||||
return (DecoderResult::Malformed(1, 1), src_consumed, dest.written());
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
},
|
||||
{
|
||||
match self.decoder_state {
|
||||
Iso2022JpDecoderState::Ascii => {
|
||||
if b == 0x1Bu8 {
|
||||
self.decoder_state = Iso2022JpDecoderState::EscapeStart;
|
||||
continue;
|
||||
}
|
||||
self.output_flag = false;
|
||||
if b > 0x7Fu8 || b == 0x0Eu8 || b == 0x0Fu8 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
destination_handle.write_ascii(b);
|
||||
continue;
|
||||
}
|
||||
Iso2022JpDecoderState::Roman => {
|
||||
if b == 0x1Bu8 {
|
||||
self.decoder_state = Iso2022JpDecoderState::EscapeStart;
|
||||
continue;
|
||||
}
|
||||
self.output_flag = false;
|
||||
if b == 0x5Cu8 {
|
||||
destination_handle.write_mid_bmp(0x00A5u16);
|
||||
continue;
|
||||
}
|
||||
if b == 0x7Eu8 {
|
||||
destination_handle.write_upper_bmp(0x203Eu16);
|
||||
continue;
|
||||
}
|
||||
if b > 0x7Fu8 || b == 0x0Eu8 || b == 0x0Fu8 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
destination_handle.write_ascii(b);
|
||||
continue;
|
||||
}
|
||||
Iso2022JpDecoderState::Katakana => {
|
||||
if b == 0x1Bu8 {
|
||||
self.decoder_state = Iso2022JpDecoderState::EscapeStart;
|
||||
continue;
|
||||
}
|
||||
self.output_flag = false;
|
||||
if b >= 0x21u8 && b <= 0x5Fu8 {
|
||||
destination_handle.write_upper_bmp(b as u16 - 0x21u16 + 0xFF61u16);
|
||||
continue;
|
||||
}
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
Iso2022JpDecoderState::LeadByte => {
|
||||
if b == 0x1Bu8 {
|
||||
self.decoder_state = Iso2022JpDecoderState::EscapeStart;
|
||||
continue;
|
||||
}
|
||||
self.output_flag = false;
|
||||
if b >= 0x21u8 && b <= 0x7Eu8 {
|
||||
self.lead = b;
|
||||
self.decoder_state = Iso2022JpDecoderState::TrailByte;
|
||||
continue;
|
||||
}
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
Iso2022JpDecoderState::TrailByte => {
|
||||
if b == 0x1Bu8 {
|
||||
self.decoder_state = Iso2022JpDecoderState::EscapeStart;
|
||||
// The byte in error is the previous
|
||||
// lead byte.
|
||||
return (DecoderResult::Malformed(1, 1),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
self.decoder_state = Iso2022JpDecoderState::LeadByte;
|
||||
let jis0208_lead_minus_offset = self.lead - 0x21;
|
||||
let byte = b;
|
||||
let handle = destination_handle;
|
||||
// The code below uses else after continue in
|
||||
// order to retain the structure seen in EUC-JP.
|
||||
let trail_minus_offset = byte.wrapping_sub(0x21);
|
||||
// Fast-track Hiragana (60% according to Lunde)
|
||||
// and Katakana (10% according to Lunde).
|
||||
if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
|
||||
// Hiragana
|
||||
handle.write_upper_bmp(0x3041 + trail_minus_offset as u16);
|
||||
continue;
|
||||
} else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
|
||||
// Katakana
|
||||
handle.write_upper_bmp(0x30A1 + trail_minus_offset as u16);
|
||||
continue;
|
||||
} else if trail_minus_offset > (0xFE - 0xA1) {
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle.consumed(),
|
||||
handle.written());
|
||||
} else {
|
||||
let pointer = mul_94(jis0208_lead_minus_offset) +
|
||||
trail_minus_offset as usize;
|
||||
let level1_pointer = pointer.wrapping_sub(1410);
|
||||
if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer]);
|
||||
continue;
|
||||
} else {
|
||||
let level2_pointer = pointer.wrapping_sub(4418);
|
||||
if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
|
||||
handle.write_upper_bmp(
|
||||
JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer],
|
||||
);
|
||||
continue;
|
||||
} else {
|
||||
let ibm_pointer = pointer.wrapping_sub(8272);
|
||||
if ibm_pointer < IBM_KANJI.len() {
|
||||
handle.write_upper_bmp(IBM_KANJI[ibm_pointer]);
|
||||
continue;
|
||||
} else if let Some(bmp) = jis0208_symbol_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp);
|
||||
continue;
|
||||
} else if let Some(bmp) = jis0208_range_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp);
|
||||
continue;
|
||||
} else {
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Iso2022JpDecoderState::EscapeStart => {
|
||||
if b == 0x24u8 || b == 0x28u8 {
|
||||
self.lead = b;
|
||||
self.decoder_state = Iso2022JpDecoderState::Escape;
|
||||
continue;
|
||||
}
|
||||
self.output_flag = false;
|
||||
self.decoder_state = self.output_state;
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle.unread(),
|
||||
destination_handle.written());
|
||||
}
|
||||
Iso2022JpDecoderState::Escape => {
|
||||
let mut state: Option<Iso2022JpDecoderState> = None;
|
||||
if self.lead == 0x28u8 && b == 0x42u8 {
|
||||
state = Some(Iso2022JpDecoderState::Ascii);
|
||||
} else if self.lead == 0x28u8 && b == 0x4Au8 {
|
||||
state = Some(Iso2022JpDecoderState::Roman);
|
||||
} else if self.lead == 0x28u8 && b == 0x49u8 {
|
||||
state = Some(Iso2022JpDecoderState::Katakana);
|
||||
} else if self.lead == 0x24u8 && (b == 0x40u8 || b == 0x42u8) {
|
||||
state = Some(Iso2022JpDecoderState::LeadByte);
|
||||
}
|
||||
match state {
|
||||
Some(s) => {
|
||||
self.lead = 0x0u8;
|
||||
self.decoder_state = s;
|
||||
self.output_state = s;
|
||||
let flag = self.output_flag;
|
||||
self.output_flag = true;
|
||||
if flag {
|
||||
// We had an escape sequence
|
||||
// immediately following another
|
||||
// escape sequence. Therefore,
|
||||
// the first one of these was
|
||||
// useless.
|
||||
return (DecoderResult::Malformed(3, 3),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
None => {
|
||||
// self.lead is still the previous
|
||||
// byte. It will be processed in
|
||||
// the preamble upon next call.
|
||||
self.pending_prepended = true;
|
||||
self.output_flag = false;
|
||||
self.decoder_state = self.output_state;
|
||||
// The byte in error is not the
|
||||
// current or the previous byte but
|
||||
// the one before those (lone 0x1B).
|
||||
return (DecoderResult::Malformed(1, 1),
|
||||
unread_handle.unread(),
|
||||
destination_handle.written());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
self,
|
||||
src_consumed,
|
||||
dest,
|
||||
source,
|
||||
b,
|
||||
destination_handle,
|
||||
unread_handle,
|
||||
check_space_bmp
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(if_let_redundant_pattern_matching, if_same_then_else))]
|
||||
fn is_mapped_for_two_byte_encode(bmp: u16) -> bool {
|
||||
// The code below uses else after return to
|
||||
// keep the same structure as in EUC-JP.
|
||||
// Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
|
||||
let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
|
||||
if bmp_minus_hiragana < 0x53 {
|
||||
true
|
||||
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
|
||||
if 0x4EDD == bmp {
|
||||
true
|
||||
} else if let Some(_) = jis0208_level1_kanji_shift_jis_encode(bmp) {
|
||||
// Use the shift_jis variant, because we don't care about the
|
||||
// byte values here.
|
||||
true
|
||||
} else if let Some(_) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
true
|
||||
} else if let Some(_) = position(&IBM_KANJI[..], bmp) {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
|
||||
if bmp_minus_katakana < 0x56 {
|
||||
true
|
||||
} else {
|
||||
let bmp_minus_space = bmp.wrapping_sub(0x3000);
|
||||
if bmp_minus_space < 3 {
|
||||
// fast-track common punctuation
|
||||
true
|
||||
} else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
|
||||
true
|
||||
} else if bmp == 0x2212 {
|
||||
true
|
||||
} else if let Some(_) = jis0208_range_encode(bmp) {
|
||||
true
|
||||
} else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 || bmp == 0xF9DC {
|
||||
true
|
||||
} else if let Some(_) = ibm_symbol_encode(bmp) {
|
||||
true
|
||||
} else if let Some(_) = jis0208_symbol_encode(bmp) {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Character set the ISO-2022-JP encoder has currently selected in its
/// output stream.
enum Iso2022JpEncoderState {
|
||||
/// Initial state; the only state that leaves nothing pending at the end
/// of the stream.
Ascii,
|
||||
/// Selected with ESC ( J when U+00A5 or U+203E has to be written.
Roman,
|
||||
/// Selected with ESC $ B when a character with a two-byte mapping has to
/// be written.
Jis0208,
|
||||
}
|
||||
|
||||
/// Streaming encoder state for ISO-2022-JP.
pub struct Iso2022JpEncoder {
|
||||
// Character set currently selected in the output; starts as `Ascii`.
state: Iso2022JpEncoderState,
|
||||
}
|
||||
|
||||
impl Iso2022JpEncoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(
|
||||
encoding,
|
||||
VariantEncoder::Iso2022Jp(Iso2022JpEncoder { state: Iso2022JpEncoderState::Ascii }),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn has_pending_state(&self) -> bool {
|
||||
match self.state {
|
||||
Iso2022JpEncoderState::Ascii => false,
|
||||
_ => true,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(&self,
|
||||
u16_length: usize)
|
||||
-> Option<usize> {
|
||||
// Worst case: every other character is ASCII/Roman and every other
|
||||
// JIS0208.
|
||||
// Two UTF-16 input units:
|
||||
// Transition to Roman: 3
|
||||
// Roman/ASCII: 1
|
||||
// Transition to JIS0208: 3
|
||||
// JIS0208: 2
|
||||
// End transition: 3
|
||||
checked_add_opt(
|
||||
checked_add(3, u16_length.checked_mul(4)),
|
||||
checked_div(u16_length.checked_add(1), 2),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(&self,
|
||||
byte_length: usize)
|
||||
-> Option<usize> {
|
||||
// Worst case: every other character is ASCII/Roman and every other
|
||||
// JIS0208.
|
||||
// Three UTF-8 input units: 1 ASCII, 2 JIS0208
|
||||
// Transition to ASCII: 3
|
||||
// Roman/ASCII: 1
|
||||
// Transition to JIS0208: 3
|
||||
// JIS0208: 2
|
||||
// End transition: 3
|
||||
checked_add(3, byte_length.checked_mul(3))
|
||||
}
|
||||
|
||||
encoder_functions!(
|
||||
{
|
||||
match self.state {
|
||||
Iso2022JpEncoderState::Ascii => {}
|
||||
_ => {
|
||||
match dest.check_space_three() {
|
||||
Space::Full(dst_written) => {
|
||||
return (EncoderResult::OutputFull, src_consumed, dst_written);
|
||||
}
|
||||
Space::Available(destination_handle) => {
|
||||
self.state = Iso2022JpEncoderState::Ascii;
|
||||
destination_handle.write_three(0x1Bu8, 0x28u8, 0x42u8);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
match self.state {
|
||||
Iso2022JpEncoderState::Ascii => {
|
||||
if c == '\u{0E}' || c == '\u{0F}' || c == '\u{1B}' {
|
||||
return (EncoderResult::Unmappable('\u{FFFD}'),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
if c <= '\u{7F}' {
|
||||
destination_handle.write_one(c as u8);
|
||||
continue;
|
||||
}
|
||||
if c == '\u{A5}' || c == '\u{203E}' {
|
||||
self.state = Iso2022JpEncoderState::Roman;
|
||||
destination_handle.write_three(0x1Bu8, 0x28u8, 0x4Au8);
|
||||
unread_handle.unread();
|
||||
continue;
|
||||
}
|
||||
if c > '\u{FFFF}' {
|
||||
return (EncoderResult::Unmappable(c),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
// Yes, if c is in index, we'll search
|
||||
// again in the Jis0208 state, but this
|
||||
// encoder is not worth optimizing.
|
||||
if is_mapped_for_two_byte_encode(c as u16) {
|
||||
self.state = Iso2022JpEncoderState::Jis0208;
|
||||
destination_handle.write_three(0x1Bu8, 0x24u8, 0x42u8);
|
||||
unread_handle.unread();
|
||||
continue;
|
||||
}
|
||||
return (EncoderResult::Unmappable(c),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
Iso2022JpEncoderState::Roman => {
|
||||
if c == '\u{0E}' || c == '\u{0F}' || c == '\u{1B}' {
|
||||
return (EncoderResult::Unmappable('\u{FFFD}'),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
if c == '\u{5C}' || c == '\u{7E}' {
|
||||
self.state = Iso2022JpEncoderState::Ascii;
|
||||
destination_handle.write_three(0x1Bu8, 0x28u8, 0x42u8);
|
||||
unread_handle.unread();
|
||||
continue;
|
||||
}
|
||||
if c <= '\u{7F}' {
|
||||
destination_handle.write_one(c as u8);
|
||||
continue;
|
||||
}
|
||||
if c == '\u{A5}' {
|
||||
destination_handle.write_one(0x5Cu8);
|
||||
continue;
|
||||
}
|
||||
if c == '\u{203E}' {
|
||||
destination_handle.write_one(0x7Eu8);
|
||||
continue;
|
||||
}
|
||||
if c > '\u{FFFF}' {
|
||||
return (EncoderResult::Unmappable(c),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
// Yes, if c is in index, we'll search
|
||||
// again in the Jis0208 state, but this
|
||||
// encoder is not worth optimizing.
|
||||
if is_mapped_for_two_byte_encode(c as u16) {
|
||||
self.state = Iso2022JpEncoderState::Jis0208;
|
||||
destination_handle.write_three(0x1Bu8, 0x24u8, 0x42u8);
|
||||
unread_handle.unread();
|
||||
continue;
|
||||
}
|
||||
return (EncoderResult::Unmappable(c),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
Iso2022JpEncoderState::Jis0208 => {
|
||||
if c <= '\u{7F}' {
|
||||
self.state = Iso2022JpEncoderState::Ascii;
|
||||
destination_handle.write_three(0x1Bu8, 0x28u8, 0x42u8);
|
||||
unread_handle.unread();
|
||||
continue;
|
||||
}
|
||||
if c == '\u{A5}' || c == '\u{203E}' {
|
||||
self.state = Iso2022JpEncoderState::Roman;
|
||||
destination_handle.write_three(0x1Bu8, 0x28u8, 0x4Au8);
|
||||
unread_handle.unread();
|
||||
continue;
|
||||
}
|
||||
if c > '\u{FFFF}' {
|
||||
// Transition to ASCII here in order
|
||||
// not to make it the responsibility
|
||||
// of the caller.
|
||||
self.state = Iso2022JpEncoderState::Ascii;
|
||||
return (EncoderResult::Unmappable(c),
|
||||
unread_handle.consumed(),
|
||||
destination_handle
|
||||
.write_three_return_written(0x1Bu8, 0x28u8, 0x42u8));
|
||||
}
|
||||
let bmp = c as u16;
|
||||
let handle = destination_handle;
|
||||
// The code below uses else after continue to
|
||||
// keep the same structure as in EUC-JP.
|
||||
// Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
|
||||
let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
|
||||
if bmp_minus_hiragana < 0x53 {
|
||||
handle.write_two(0x24, 0x21 + bmp_minus_hiragana as u8);
|
||||
continue;
|
||||
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
|
||||
if 0x4EDD == bmp {
|
||||
// Ideograph on the symbol row!
|
||||
handle.write_two(0x21, 0xB8 - 0x80);
|
||||
continue;
|
||||
} else if let Some((lead, trail)) =
|
||||
jis0208_level1_kanji_iso_2022_jp_encode(bmp) {
|
||||
handle.write_two(lead, trail);
|
||||
continue;
|
||||
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
let lead = (pos / 94) + (0xD0 - 0x80);
|
||||
let trail = (pos % 94) + 0x21;
|
||||
handle.write_two(lead as u8, trail as u8);
|
||||
continue;
|
||||
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
|
||||
let lead = (pos / 94) + (0xF9 - 0x80);
|
||||
let trail = (pos % 94) + 0x21;
|
||||
handle.write_two(lead as u8, trail as u8);
|
||||
continue;
|
||||
} else {
|
||||
self.state = Iso2022JpEncoderState::Ascii;
|
||||
return (EncoderResult::Unmappable(c),
|
||||
unread_handle.consumed(),
|
||||
handle.write_three_return_written(0x1Bu8, 0x28u8, 0x42u8));
|
||||
}
|
||||
} else {
|
||||
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
|
||||
if bmp_minus_katakana < 0x56 {
|
||||
handle.write_two(0x25, 0x21 + bmp_minus_katakana as u8);
|
||||
continue;
|
||||
} else {
|
||||
let bmp_minus_space = bmp.wrapping_sub(0x3000);
|
||||
if bmp_minus_space < 3 {
|
||||
// fast-track common punctuation
|
||||
handle.write_two(0x21, 0x21 + bmp_minus_space as u8);
|
||||
continue;
|
||||
}
|
||||
let bmp_minus_half_width = bmp.wrapping_sub(0xFF61);
|
||||
if bmp_minus_half_width <= (0xFF9F - 0xFF61) {
|
||||
// We have half-width katakana. The lead is either
|
||||
// row 1 or 5 of JIS X 0208, so the lookup table
|
||||
// only stores the trail.
|
||||
let lead = if bmp != 0xFF70 &&
|
||||
in_inclusive_range16(bmp, 0xFF66, 0xFF9D) {
|
||||
0x25u8
|
||||
} else {
|
||||
0x21u8
|
||||
};
|
||||
let trail = ISO_2022_JP_HALF_WIDTH_TRAIL[bmp_minus_half_width as
|
||||
usize];
|
||||
handle.write_two(lead, trail);
|
||||
continue;
|
||||
} else if bmp == 0x2212 {
|
||||
handle.write_two(0x21, 0x5D);
|
||||
continue;
|
||||
} else if let Some(pointer) = jis0208_range_encode(bmp) {
|
||||
let lead = (pointer / 94) + 0x21;
|
||||
let trail = (pointer % 94) + 0x21;
|
||||
handle.write_two(lead as u8, trail as u8);
|
||||
continue;
|
||||
} else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 ||
|
||||
bmp == 0xF9DC {
|
||||
// Guaranteed to be found in IBM_KANJI
|
||||
let pos = position(&IBM_KANJI[..], bmp).unwrap();
|
||||
let lead = (pos / 94) + (0xF9 - 0x80);
|
||||
let trail = (pos % 94) + 0x21;
|
||||
handle.write_two(lead as u8, trail as u8);
|
||||
continue;
|
||||
} else if let Some(pointer) = ibm_symbol_encode(bmp) {
|
||||
let lead = (pointer / 94) + 0x21;
|
||||
let trail = (pointer % 94) + 0x21;
|
||||
handle.write_two(lead as u8, trail as u8);
|
||||
continue;
|
||||
} else if let Some(pointer) = jis0208_symbol_encode(bmp) {
|
||||
let lead = (pointer / 94) + 0x21;
|
||||
let trail = (pointer % 94) + 0x21;
|
||||
handle.write_two(lead as u8, trail as u8);
|
||||
continue;
|
||||
} else {
|
||||
self.state = Iso2022JpEncoderState::Ascii;
|
||||
return (EncoderResult::Unmappable(c),
|
||||
unread_handle.consumed(),
|
||||
handle.write_three_return_written(0x1Bu8, 0x28u8, 0x42u8));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
self,
|
||||
src_consumed,
|
||||
source,
|
||||
dest,
|
||||
c,
|
||||
destination_handle,
|
||||
unread_handle,
|
||||
check_space_three
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::super::testing::*;
|
||||
use super::super::*;
|
||||
|
||||
fn decode_iso_2022_jp(bytes: &[u8], expect: &str) {
|
||||
decode(ISO_2022_JP, bytes, expect);
|
||||
}
|
||||
|
||||
fn encode_iso_2022_jp(string: &str, expect: &[u8]) {
|
||||
encode(ISO_2022_JP, string, expect);
|
||||
}
|
||||
|
||||
/// Spot checks for the ISO-2022-JP decoder: plain ASCII, truncated and
/// unsupported escape sequences, each of the selectable character sets,
/// broken two-byte sequences and transitions between states.
#[test]
fn test_iso_2022_jp_decode() {
    // Empty input decodes to empty output.
    decode_iso_2022_jp(b"", "");

    // ASCII, including the shift-out/shift-in bytes, which are malformed.
    decode_iso_2022_jp(b"\x61\x62", "\u{0061}\u{0062}");
    decode_iso_2022_jp(b"\x7F\x0E\x0F", "\u{007F}\u{FFFD}\u{FFFD}");

    // Partial escapes: the escape byte is an error and whatever followed it
    // is decoded as ordinary input.
    decode_iso_2022_jp(b"\x1B", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$", "\u{FFFD}$");
    decode_iso_2022_jp(b"\x1B(", "\u{FFFD}(");
    decode_iso_2022_jp(b"\x1B.", "\u{FFFD}.");

    // ISO escapes: supported ones produce no output, unsupported ones are
    // treated like partial escapes.
    decode_iso_2022_jp(b"\x1B(B", ""); // ASCII
    decode_iso_2022_jp(b"\x1B(J", ""); // Roman
    decode_iso_2022_jp(b"\x1B$@", ""); // 0208
    decode_iso_2022_jp(b"\x1B$B", ""); // 0208
    decode_iso_2022_jp(b"\x1B$(D", "\u{FFFD}$(D"); // 0212
    decode_iso_2022_jp(b"\x1B$A", "\u{FFFD}$A"); // GB2312
    decode_iso_2022_jp(b"\x1B$(C", "\u{FFFD}$(C"); // KR
    decode_iso_2022_jp(b"\x1B.A", "\u{FFFD}.A"); // Latin-1
    decode_iso_2022_jp(b"\x1B.F", "\u{FFFD}.F"); // Greek
    decode_iso_2022_jp(b"\x1B(I", ""); // Half-width Katakana
    decode_iso_2022_jp(b"\x1B$(O", "\u{FFFD}$(O"); // 0213
    decode_iso_2022_jp(b"\x1B$(P", "\u{FFFD}$(P"); // 0213
    decode_iso_2022_jp(b"\x1B$(Q", "\u{FFFD}$(Q"); // 0213
    decode_iso_2022_jp(b"\x1B$)C", "\u{FFFD}$)C"); // KR
    decode_iso_2022_jp(b"\x1B$)A", "\u{FFFD}$)A"); // GB2312
    decode_iso_2022_jp(b"\x1B$)G", "\u{FFFD}$)G"); // CNS
    decode_iso_2022_jp(b"\x1B$*H", "\u{FFFD}$*H"); // CNS
    decode_iso_2022_jp(b"\x1B$)E", "\u{FFFD}$)E"); // IR
    decode_iso_2022_jp(b"\x1B$+I", "\u{FFFD}$+I"); // CNS
    decode_iso_2022_jp(b"\x1B$+J", "\u{FFFD}$+J"); // CNS
    decode_iso_2022_jp(b"\x1B$+K", "\u{FFFD}$+K"); // CNS
    decode_iso_2022_jp(b"\x1B$+L", "\u{FFFD}$+L"); // CNS
    decode_iso_2022_jp(b"\x1B$+M", "\u{FFFD}$+M"); // CNS
    decode_iso_2022_jp(b"\x1B$(@", "\u{FFFD}$(@"); // 0208
    decode_iso_2022_jp(b"\x1B$(A", "\u{FFFD}$(A"); // GB2312
    decode_iso_2022_jp(b"\x1B$(B", "\u{FFFD}$(B"); // 0208
    decode_iso_2022_jp(b"\x1B%G", "\u{FFFD}%G"); // UTF-8

    // ASCII state, both implicit and explicitly selected.
    decode_iso_2022_jp(b"\x5B", "\u{005B}");
    decode_iso_2022_jp(b"\x5C", "\u{005C}");
    decode_iso_2022_jp(b"\x7E", "\u{007E}");
    decode_iso_2022_jp(b"\x0E", "\u{FFFD}");
    decode_iso_2022_jp(b"\x0F", "\u{FFFD}");
    decode_iso_2022_jp(b"\x80", "\u{FFFD}");
    decode_iso_2022_jp(b"\xFF", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\x5B", "\u{005B}");
    decode_iso_2022_jp(b"\x1B(B\x5C", "\u{005C}");
    decode_iso_2022_jp(b"\x1B(B\x7E", "\u{007E}");
    decode_iso_2022_jp(b"\x1B(B\x0E", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\x0F", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\x80", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\xFF", "\u{FFFD}");

    // Roman: 0x5C and 0x7E decode differently from ASCII.
    decode_iso_2022_jp(b"\x1B(J\x5B", "\u{005B}");
    decode_iso_2022_jp(b"\x1B(J\x5C", "\u{00A5}");
    decode_iso_2022_jp(b"\x1B(J\x7E", "\u{203E}");
    decode_iso_2022_jp(b"\x1B(J\x0E", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(J\x0F", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(J\x80", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(J\xFF", "\u{FFFD}");

    // Katakana: only 0x21..=0x5F are valid.
    decode_iso_2022_jp(b"\x1B(I\x20", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x21", "\u{FF61}");
    decode_iso_2022_jp(b"\x1B(I\x5F", "\u{FF9F}");
    decode_iso_2022_jp(b"\x1B(I\x60", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x0E", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x0F", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x80", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\xFF", "\u{FFFD}");

    // 0208 differences from 1978 to 1983: both escapes decode identically.
    decode_iso_2022_jp(b"\x1B$@\x54\x64", "\u{58FA}");
    decode_iso_2022_jp(b"\x1B$@\x44\x5B", "\u{58F7}");
    decode_iso_2022_jp(b"\x1B$@\x74\x21", "\u{582F}");
    decode_iso_2022_jp(b"\x1B$@\x36\x46", "\u{5C2D}");
    decode_iso_2022_jp(b"\x1B$@\x28\x2E", "\u{250F}");
    decode_iso_2022_jp(b"\x1B$B\x54\x64", "\u{58FA}");
    decode_iso_2022_jp(b"\x1B$B\x44\x5B", "\u{58F7}");
    decode_iso_2022_jp(b"\x1B$B\x74\x21", "\u{582F}");
    decode_iso_2022_jp(b"\x1B$B\x36\x46", "\u{5C2D}");
    decode_iso_2022_jp(b"\x1B$B\x28\x2E", "\u{250F}");

    // Broken 0208
    decode_iso_2022_jp(b"\x1B$B\x28\x41", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$@\x80\x54\x64", "\u{FFFD}\u{58FA}");
    decode_iso_2022_jp(b"\x1B$B\x28\x80", "\u{FFFD}");

    // Transitions out of ASCII
    decode_iso_2022_jp(b"\x1B(B\x5C\x1B(J\x5C", "\u{005C}\u{00A5}");
    decode_iso_2022_jp(b"\x1B(B\x5C\x1B(I\x21", "\u{005C}\u{FF61}");
    decode_iso_2022_jp(b"\x1B(B\x5C\x1B$@\x54\x64", "\u{005C}\u{58FA}");
    decode_iso_2022_jp(b"\x1B(B\x5C\x1B$B\x54\x64", "\u{005C}\u{58FA}");

    // Transitions out of Roman
    decode_iso_2022_jp(b"\x1B(J\x5C\x1B(B\x5C", "\u{00A5}\u{005C}");
    decode_iso_2022_jp(b"\x1B(J\x5C\x1B(I\x21", "\u{00A5}\u{FF61}");
    decode_iso_2022_jp(b"\x1B(J\x5C\x1B$@\x54\x64", "\u{00A5}\u{58FA}");
    decode_iso_2022_jp(b"\x1B(J\x5C\x1B$B\x54\x64", "\u{00A5}\u{58FA}");

    // Transitions out of Katakana
    decode_iso_2022_jp(b"\x1B(I\x21\x1B(J\x5C", "\u{FF61}\u{00A5}");
    decode_iso_2022_jp(b"\x1B(I\x21\x1B(B\x5C", "\u{FF61}\u{005C}");
    decode_iso_2022_jp(b"\x1B(I\x21\x1B$@\x54\x64", "\u{FF61}\u{58FA}");
    decode_iso_2022_jp(b"\x1B(I\x21\x1B$B\x54\x64", "\u{FF61}\u{58FA}");

    // Transitions out of 0208 selected with ESC $ @
    decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B(J\x5C", "\u{58FA}\u{00A5}");
    decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B(I\x21", "\u{58FA}\u{FF61}");
    decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B(B\x5C", "\u{58FA}\u{005C}");
    decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B$B\x54\x64", "\u{58FA}\u{58FA}");

    // Transitions out of 0208 selected with ESC $ B
    decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B(J\x5C", "\u{58FA}\u{00A5}");
    decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B(I\x21", "\u{58FA}\u{FF61}");
    decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B$@\x54\x64", "\u{58FA}\u{58FA}");
    decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B(B\x5C", "\u{58FA}\u{005C}");

    // Empty transitions: an escape immediately followed by another escape
    // is reported as a single error.
    decode_iso_2022_jp(b"\x1B(B\x1B(J", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\x1B(I", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\x1B$@", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\x1B$B", "\u{FFFD}");

    decode_iso_2022_jp(b"\x1B(J\x1B(B", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(J\x1B(I", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(J\x1B$@", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(J\x1B$B", "\u{FFFD}");

    decode_iso_2022_jp(b"\x1B(I\x1B(J", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x1B(B", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x1B$@", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x1B$B", "\u{FFFD}");

    decode_iso_2022_jp(b"\x1B$@\x1B(J", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$@\x1B(I", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$@\x1B(B", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$@\x1B$B", "\u{FFFD}");

    decode_iso_2022_jp(b"\x1B$B\x1B(J", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$B\x1B(I", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$B\x1B$@", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$B\x1B(B", "\u{FFFD}");

    // Transitions to self
    decode_iso_2022_jp(b"\x1B(B\x5C\x1B(B\x5C", "\u{005C}\u{005C}");
    decode_iso_2022_jp(b"\x1B(J\x5C\x1B(J\x5C", "\u{00A5}\u{00A5}");
    decode_iso_2022_jp(b"\x1B(I\x21\x1B(I\x21", "\u{FF61}\u{FF61}");
    decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B$@\x54\x64", "\u{58FA}\u{58FA}");
    decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B$B\x54\x64", "\u{58FA}\u{58FA}");
}
|
||||
|
||||
#[test]
fn test_iso_2022_jp_encode() {
    // Expectations containing `&#...;` are the decimal numeric character
    // references expected for input the encoder cannot represent. They
    // must be spelled out in ASCII: Rust byte-string literals cannot
    // contain non-ASCII characters.

    // Empty
    encode_iso_2022_jp("", b"");

    // ASCII
    encode_iso_2022_jp("ab", b"ab");
    encode_iso_2022_jp("\u{1F4A9}", b"&#128169;");
    encode_iso_2022_jp("\x1B", b"&#65533;");
    encode_iso_2022_jp("\x0E", b"&#65533;");
    encode_iso_2022_jp("\x0F", b"&#65533;");

    // Roman
    encode_iso_2022_jp("a\u{00A5}b", b"a\x1B(J\x5Cb\x1B(B");
    encode_iso_2022_jp("a\u{203E}b", b"a\x1B(J\x7Eb\x1B(B");
    encode_iso_2022_jp("a\u{00A5}b\x5C", b"a\x1B(J\x5Cb\x1B(B\x5C");
    encode_iso_2022_jp("a\u{203E}b\x7E", b"a\x1B(J\x7Eb\x1B(B\x7E");
    encode_iso_2022_jp("\u{00A5}\u{1F4A9}", b"\x1B(J\x5C&#128169;\x1B(B");
    encode_iso_2022_jp("\u{00A5}\x1B", b"\x1B(J\x5C&#65533;\x1B(B");
    encode_iso_2022_jp("\u{00A5}\x0E", b"\x1B(J\x5C&#65533;\x1B(B");
    encode_iso_2022_jp("\u{00A5}\x0F", b"\x1B(J\x5C&#65533;\x1B(B");
    encode_iso_2022_jp("\u{00A5}\u{58FA}", b"\x1B(J\x5C\x1B$B\x54\x64\x1B(B");

    // Half-width Katakana (expected output is in the JIS X 0208 state)
    encode_iso_2022_jp("\u{FF61}", b"\x1B$B\x21\x23\x1B(B");
    encode_iso_2022_jp("\u{FF65}", b"\x1B$B\x21\x26\x1B(B");
    encode_iso_2022_jp("\u{FF66}", b"\x1B$B\x25\x72\x1B(B");
    encode_iso_2022_jp("\u{FF70}", b"\x1B$B\x21\x3C\x1B(B");
    encode_iso_2022_jp("\u{FF9D}", b"\x1B$B\x25\x73\x1B(B");
    encode_iso_2022_jp("\u{FF9E}", b"\x1B$B\x21\x2B\x1B(B");
    encode_iso_2022_jp("\u{FF9F}", b"\x1B$B\x21\x2C\x1B(B");

    // 0208
    encode_iso_2022_jp("\u{58FA}", b"\x1B$B\x54\x64\x1B(B");
    encode_iso_2022_jp("\u{58FA}\u{250F}", b"\x1B$B\x54\x64\x28\x2E\x1B(B");
    encode_iso_2022_jp("\u{58FA}\u{1F4A9}", b"\x1B$B\x54\x64\x1B(B&#128169;");
    encode_iso_2022_jp("\u{58FA}\x1B", b"\x1B$B\x54\x64\x1B(B&#65533;");
    encode_iso_2022_jp("\u{58FA}\x0E", b"\x1B$B\x54\x64\x1B(B&#65533;");
    encode_iso_2022_jp("\u{58FA}\x0F", b"\x1B$B\x54\x64\x1B(B&#65533;");
    encode_iso_2022_jp("\u{58FA}\u{00A5}", b"\x1B$B\x54\x64\x1B(J\x5C\x1B(B");
    encode_iso_2022_jp("\u{58FA}a", b"\x1B$B\x54\x64\x1B(Ba");
}
|
||||
|
||||
#[test]
fn test_iso_2022_jp_decode_all() {
    // Decode the whole fixture in one call and compare the result with the
    // reference text. The fixture is expected to trigger decode errors.
    let bytes = include_bytes!("test_data/iso_2022_jp_in.txt");
    let reference = include_str!("test_data/iso_2022_jp_in_ref.txt");
    let (decoded, saw_errors) = ISO_2022_JP.decode_without_bom_handling(bytes);
    assert!(saw_errors, "Should have had errors.");
    assert_eq!(&decoded[..], reference);
}
|
||||
|
||||
#[test]
fn test_iso_2022_jp_encode_all() {
    // Encode the whole fixture in one call. The output encoding must stay
    // ISO-2022-JP and every character must be mappable.
    let text = include_str!("test_data/iso_2022_jp_out.txt");
    let reference = include_bytes!("test_data/iso_2022_jp_out_ref.txt");
    let (encoded, used_encoding, saw_errors) = ISO_2022_JP.encode(text);
    assert!(!saw_errors, "Should not have had errors.");
    assert_eq!(used_encoding, ISO_2022_JP);
    assert_eq!(&encoded[..], &reference[..]);
}
|
||||
|
||||
#[test]
fn test_iso_2022_jp_half_width_katakana_length() {
    // Checks that the buffer length reported for one more input byte is
    // large enough for the three UTF-8 bytes of a half-width katakana
    // character once the decoder has consumed ESC ( I.
    let mut output = [0u8; 20];
    let mut decoder = ISO_2022_JP.new_decoder();

    // The escape sequence alone is consumed without producing output.
    let (escape_result, escape_read, escape_written) =
        decoder.decode_to_utf8_without_replacement(b"\x1B\x28\x49", &mut output, false);
    assert_eq!(escape_result, DecoderResult::InputEmpty);
    assert_eq!(escape_read, 3);
    assert_eq!(escape_written, 0);

    // One more byte must fit in the buffer size the decoder asks for.
    let needed = decoder
        .max_utf8_buffer_length_without_replacement(1)
        .unwrap();
    let (result, read, written) =
        decoder.decode_to_utf8_without_replacement(b"\x21", &mut output[..needed], true);
    assert_eq!(result, DecoderResult::InputEmpty);
    assert_eq!(read, 1);
    assert_eq!(written, 3);
    // U+FF61 encoded as UTF-8.
    assert_eq!(&output[..3], &b"\xEF\xBD\xA1"[..]);
}
|
||||
|
||||
#[test]
fn test_iso_2022_jp_length_after_escape() {
    // Checks that, after a lone ESC has been buffered, the reported UTF-16
    // buffer length for one more byte has room for both the replacement
    // character and the byte that follows the broken escape.
    let mut output = [0u16; 20];
    let mut decoder = ISO_2022_JP.new_decoder();

    // A lone ESC is consumed silently; nothing is written yet.
    let (escape_result, escape_read, escape_written, escape_errors) =
        decoder.decode_to_utf16(b"\x1B", &mut output, false);
    assert_eq!(escape_result, CoderResult::InputEmpty);
    assert_eq!(escape_read, 1);
    assert_eq!(escape_written, 0);
    assert!(!escape_errors);

    // "A" does not continue an escape sequence: expect U+FFFD, then 'A'.
    let needed = decoder.max_utf16_buffer_length(1).unwrap();
    let (result, read, written, had_errors) =
        decoder.decode_to_utf16(b"A", &mut output[..needed], true);
    assert_eq!(result, CoderResult::InputEmpty);
    assert_eq!(read, 1);
    assert_eq!(written, 2);
    assert!(had_errors);
    assert_eq!(&output[..2], &[0xFFFDu16, 0x0041u16][..]);
}
|
||||
|
||||
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,102 @@
|
|||
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use variant::*;
|
||||
use super::*;
|
||||
|
||||
pub struct ReplacementDecoder {
|
||||
emitted: bool,
|
||||
}
|
||||
|
||||
impl ReplacementDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::Replacement(ReplacementDecoder { emitted: false })
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, _u16_length: usize) -> Option<usize> {
|
||||
Some(1)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, _byte_length: usize) -> Option<usize> {
|
||||
Some(3)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, _byte_length: usize) -> Option<usize> {
|
||||
Some(3)
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_raw(&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
_last: bool)
|
||||
-> (DecoderResult, usize, usize) {
|
||||
// Don't err if the input stream is empty. See
|
||||
// https://github.com/whatwg/encoding/issues/33
|
||||
if self.emitted || src.is_empty() {
|
||||
(DecoderResult::InputEmpty, src.len(), 0)
|
||||
} else if dst.len() < 1 {
|
||||
// Make sure there's room for the replacement character.
|
||||
(DecoderResult::OutputFull, 0, 0)
|
||||
} else {
|
||||
self.emitted = true;
|
||||
(DecoderResult::Malformed(1, 0), 1, 0)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8_raw(&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u8],
|
||||
_last: bool)
|
||||
-> (DecoderResult, usize, usize) {
|
||||
// Don't err if the input stream is empty. See
|
||||
// https://github.com/whatwg/encoding/issues/33
|
||||
if self.emitted || src.is_empty() {
|
||||
(DecoderResult::InputEmpty, src.len(), 0)
|
||||
} else if dst.len() < 3 {
|
||||
// Make sure there's room for the replacement character.
|
||||
(DecoderResult::OutputFull, 0, 0)
|
||||
} else {
|
||||
self.emitted = true;
|
||||
(DecoderResult::Malformed(1, 0), 1, 0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `input` with the replacement encoding and checks the result.
    fn decode_replacement(input: &[u8], expected: &str) {
        decode_without_padding(REPLACEMENT, input, expected);
    }

    /// Encodes `input` via the replacement encoding and checks the result.
    fn encode_replacement(input: &str, expected: &[u8]) {
        encode(REPLACEMENT, input, expected);
    }

    #[test]
    fn test_replacement_decode() {
        // Empty input yields nothing; any non-empty input yields exactly
        // one U+FFFD no matter how long it is.
        let cases: [(&[u8], &str); 3] = [(&b""[..], ""),
                                         (&b"A"[..], "\u{FFFD}"),
                                         (&b"AB"[..], "\u{FFFD}")];
        for &(input, expected) in cases.iter() {
            decode_replacement(input, expected);
        }
    }

    #[test]
    fn test_replacement_encode() {
        // Empty
        encode_replacement("", b"");

        // The output encoding of the replacement encoding is UTF-8.
        assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
        let text = "\u{1F4A9}\u{2603}";
        encode_replacement(text, text.as_bytes());
    }
}
|
|
@ -0,0 +1,400 @@
|
|||
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use handles::*;
|
||||
use data::*;
|
||||
use variant::*;
|
||||
use super::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range;
|
||||
use super::in_inclusive_range16;
|
||||
|
||||
pub struct ShiftJisDecoder {
|
||||
lead: Option<u8>,
|
||||
}
|
||||
|
||||
impl ShiftJisDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::ShiftJis(ShiftJisDecoder { lead: None })
|
||||
}
|
||||
|
||||
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(
|
||||
match self.lead {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
self.plus_one_if_lead(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// worst case: 1 to 3 (half-width katakana)
|
||||
self.max_utf8_buffer_length(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_mul(3, self.plus_one_if_lead(byte_length))
|
||||
}
|
||||
|
||||
ascii_compatible_two_byte_decoder_functions!(
|
||||
{
|
||||
// If lead is between 0x81 and 0x9F, inclusive,
|
||||
// subtract offset 0x81. Else if lead is
|
||||
// between 0xE0 and 0xFC, inclusive, subtract
|
||||
// offset 0xC1. Else if lead is between
|
||||
// 0xA1 and 0xDF, inclusive, map to half-width
|
||||
// Katakana. Else if lead is 0x80, pass through.
|
||||
let mut non_ascii_minus_offset =
|
||||
non_ascii.wrapping_sub(0x81);
|
||||
if non_ascii_minus_offset > (0x9F - 0x81) {
|
||||
let non_ascii_minus_range_start = non_ascii.wrapping_sub(0xE0);
|
||||
if non_ascii_minus_range_start > (0xFC - 0xE0) {
|
||||
let non_ascii_minus_half_with_katakana_start = non_ascii.wrapping_sub(0xA1);
|
||||
if non_ascii_minus_half_with_katakana_start > (0xDF - 0xA1) {
|
||||
if non_ascii == 0x80 {
|
||||
handle.write_mid_bmp(0x80);
|
||||
// Not caring about optimizing subsequent non-ASCII
|
||||
continue 'outermost;
|
||||
}
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
handle.write_upper_bmp(0xFF61 + non_ascii_minus_half_with_katakana_start as u16);
|
||||
// Not caring about optimizing subsequent non-ASCII
|
||||
continue 'outermost;
|
||||
}
|
||||
non_ascii_minus_offset = non_ascii - 0xC1;
|
||||
}
|
||||
non_ascii_minus_offset
|
||||
},
|
||||
{
|
||||
// If trail is between 0x40 and 0x7E, inclusive,
|
||||
// subtract offset 0x40. Else if trail is
|
||||
// between 0x80 and 0xFC, inclusive, subtract
|
||||
// offset 0x41.
|
||||
// Fast-track Hiragana (60% according to Lunde)
|
||||
// and Katakana (10% acconding to Lunde).
|
||||
// Hiragana doesn't cross 0x7F, but Katakana does.
|
||||
// We can check for Hiragana before normalizing
|
||||
// trail.
|
||||
let trail_minus_hiragana = byte.wrapping_sub(0x9F);
|
||||
if lead_minus_offset == 0x01 && trail_minus_hiragana < 0x53 {
|
||||
// Hiragana
|
||||
handle.write_upper_bmp(0x3041 + trail_minus_hiragana as u16)
|
||||
} else {
|
||||
let mut trail_minus_offset =
|
||||
byte.wrapping_sub(0x40);
|
||||
if trail_minus_offset > (0x7E - 0x40) {
|
||||
let trail_minus_range_start =
|
||||
byte.wrapping_sub(0x80);
|
||||
if trail_minus_range_start > (0xFC - 0x80) {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
trail_minus_offset = byte - 0x41;
|
||||
}
|
||||
if lead_minus_offset == 0x02 &&
|
||||
trail_minus_offset < 0x56 {
|
||||
// Katakana
|
||||
handle.write_upper_bmp(0x30A1 + trail_minus_offset as u16)
|
||||
} else {
|
||||
let pointer = lead_minus_offset as usize *
|
||||
188usize +
|
||||
trail_minus_offset as usize;
|
||||
let level1_pointer = pointer.wrapping_sub(1410);
|
||||
if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
|
||||
} else {
|
||||
let level2_pointer = pointer.wrapping_sub(4418);
|
||||
if level2_pointer <
|
||||
JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
|
||||
} else {
|
||||
let upper_ibm_pointer = pointer.wrapping_sub(10744);
|
||||
if upper_ibm_pointer < IBM_KANJI.len() {
|
||||
handle.write_upper_bmp(IBM_KANJI[upper_ibm_pointer])
|
||||
} else {
|
||||
let lower_ibm_pointer = pointer.wrapping_sub(8272);
|
||||
if lower_ibm_pointer < IBM_KANJI.len() {
|
||||
handle.write_upper_bmp(IBM_KANJI[lower_ibm_pointer])
|
||||
} else if in_inclusive_range(pointer, 8836, 10715) {
|
||||
handle.write_upper_bmp((0xE000 - 8836 + pointer) as u16)
|
||||
} else if let Some(bmp) = jis0208_symbol_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else if let Some(bmp) = jis0208_range_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
byte,
|
||||
lead_minus_offset,
|
||||
unread_handle_trail,
|
||||
source,
|
||||
handle,
|
||||
'outermost,
|
||||
copy_ascii_from_check_space_bmp,
|
||||
check_space_bmp,
|
||||
false);
|
||||
}
|
||||
|
||||
pub struct ShiftJisEncoder;
|
||||
|
||||
impl ShiftJisEncoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::ShiftJis(ShiftJisEncoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(&self,
|
||||
u16_length: usize)
|
||||
-> Option<usize> {
|
||||
u16_length.checked_mul(2)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(&self,
|
||||
byte_length: usize)
|
||||
-> Option<usize> {
|
||||
byte_length.checked_add(1)
|
||||
}
|
||||
|
||||
ascii_compatible_bmp_encoder_functions!(
|
||||
{
|
||||
// Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
|
||||
let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
|
||||
if bmp_minus_hiragana < 0x53 {
|
||||
handle.write_two(0x82, 0x9F + bmp_minus_hiragana as u8)
|
||||
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
|
||||
if let Some((lead, trail)) = jis0208_level1_kanji_shift_jis_encode(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else {
|
||||
let pointer = if 0x4EDD == bmp {
|
||||
// Ideograph on the symbol row!
|
||||
23
|
||||
} else if let Some(pos) =
|
||||
jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
4418 + pos
|
||||
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
|
||||
10744 + pos
|
||||
} else {
|
||||
return (EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
};
|
||||
let lead = pointer / 188;
|
||||
let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
|
||||
let trail = pointer % 188;
|
||||
let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
|
||||
handle.write_two((lead + lead_offset) as u8, (trail + trail_offset) as u8)
|
||||
}
|
||||
} else {
|
||||
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
|
||||
if bmp_minus_katakana < 0x56 {
|
||||
let trail_offset = if bmp_minus_katakana < 0x3F {
|
||||
0x40
|
||||
} else {
|
||||
0x41
|
||||
};
|
||||
handle.write_two(0x83, (trail_offset + bmp_minus_katakana) as u8)
|
||||
} else {
|
||||
let bmp_minus_space = bmp.wrapping_sub(0x3000);
|
||||
if bmp_minus_space < 3 {
|
||||
// fast-track common punctuation
|
||||
handle.write_two(0x81, 0x40 + bmp_minus_space as u8)
|
||||
} else if bmp == 0xA5 {
|
||||
handle.write_one(0x5Cu8)
|
||||
} else if bmp == 0x80 {
|
||||
handle.write_one(0x80u8)
|
||||
} else if bmp == 0x203E {
|
||||
handle.write_one(0x7Eu8)
|
||||
} else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
|
||||
handle.write_one((bmp - (0xFF61 - 0xA1)) as u8)
|
||||
} else if bmp == 0x2212 {
|
||||
handle.write_two(0x81u8, 0x7Cu8)
|
||||
} else {
|
||||
let bmp_minus_roman = bmp.wrapping_sub(0x2170);
|
||||
let pointer = if bmp_minus_roman <= (0x2179 - 0x2170) {
|
||||
10716 + bmp_minus_roman as usize
|
||||
} else if let Some(pointer) = jis0208_range_encode(bmp) {
|
||||
pointer
|
||||
} else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 ||
|
||||
bmp == 0xF9DC {
|
||||
// Guaranteed to be found in IBM_KANJI
|
||||
let pos = position(&IBM_KANJI[..], bmp).unwrap();
|
||||
10744 + pos
|
||||
} else if let Some(pointer) = jis0208_symbol_encode(bmp) {
|
||||
pointer
|
||||
} else {
|
||||
return (EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
};
|
||||
let lead = pointer / 188;
|
||||
let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
|
||||
let trail = pointer % 188;
|
||||
let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
|
||||
handle.write_two((lead + lead_offset) as u8, (trail + trail_offset) as u8)
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
bmp,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_two,
|
||||
check_space_two,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as Shift_JIS and checks the result against `expect`.
    fn decode_shift_jis(bytes: &[u8], expect: &str) {
        decode(SHIFT_JIS, bytes, expect);
    }

    /// Encodes `string` as Shift_JIS and checks the result against `expect`.
    fn encode_shift_jis(string: &str, expect: &[u8]) {
        encode(SHIFT_JIS, string, expect);
    }

    #[test]
    fn test_shift_jis_decode() {
        // Empty
        decode_shift_jis(b"", "");

        // ASCII
        decode_shift_jis(b"\x61\x62", "\u{0061}\u{0062}");

        // Half-width
        decode_shift_jis(b"\xA1", "\u{FF61}");
        decode_shift_jis(b"\xDF", "\u{FF9F}");
        decode_shift_jis(b"\xA0", "\u{FFFD}");
        decode_shift_jis(b"\xE0", "\u{FFFD}");
        decode_shift_jis(b"\xA0+", "\u{FFFD}+");
        decode_shift_jis(b"\xE0+", "\u{FFFD}+");

        // EUDC
        decode_shift_jis(b"\xF0\x40", "\u{E000}");
        decode_shift_jis(b"\xF9\xFC", "\u{E757}");
        decode_shift_jis(b"\xEF\xFC", "\u{FFFD}");
        decode_shift_jis(b"\xFA\x40", "\u{2170}");

        // JIS 0208
        decode_shift_jis(b"\x81\x40", "\u{3000}");
        decode_shift_jis(b"\x81\x3F", "\u{FFFD}?");
        decode_shift_jis(b"\xEE\xFC", "\u{FF02}");
        decode_shift_jis(b"\xEE\xFD", "\u{FFFD}");
        decode_shift_jis(b"\xFA\x40", "\u{2170}");
        decode_shift_jis(b"\xFA\x3F", "\u{FFFD}?");
        decode_shift_jis(b"\xFC\x4B", "\u{9ED1}");
        decode_shift_jis(b"\xFC\x4C", "\u{FFFD}L");
    }

    #[test]
    fn test_shift_jis_encode() {
        // Expectations containing `&#...;` are the decimal numeric
        // character references expected for unmappable input. They must
        // be written in ASCII: byte-string literals cannot hold non-ASCII
        // characters.

        // Empty
        encode_shift_jis("", b"");

        // ASCII
        encode_shift_jis("\u{0061}\u{0062}", b"\x61\x62");

        // Exceptional code points
        encode_shift_jis("\u{0080}", b"\x80");
        encode_shift_jis("\u{00A5}", b"\x5C");
        encode_shift_jis("\u{203E}", b"\x7E");
        encode_shift_jis("\u{2212}", b"\x81\x7C");

        // Half-width
        encode_shift_jis("\u{FF61}", b"\xA1");
        encode_shift_jis("\u{FF9F}", b"\xDF");

        // EUDC (decodable, but not encodable)
        encode_shift_jis("\u{E000}", b"&#57344;");
        encode_shift_jis("\u{E757}", b"&#59223;");

        // JIS 0212
        encode_shift_jis("\u{02D8}", b"&#728;");

        // JIS 0208
        encode_shift_jis("\u{3000}", b"\x81\x40");
        encode_shift_jis("\u{FF02}", b"\xFA\x57");
        encode_shift_jis("\u{2170}", b"\xFA\x40");
        encode_shift_jis("\u{9ED1}", b"\xFC\x4B");
    }

    #[test]
    fn test_shift_jis_decode_all() {
        // Whole-fixture decode; the fixture is expected to contain errors.
        let input = include_bytes!("test_data/shift_jis_in.txt");
        let expectation = include_str!("test_data/shift_jis_in_ref.txt");
        let (cow, had_errors) = SHIFT_JIS.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    fn test_shift_jis_encode_all() {
        // Whole-fixture encode; every character must be mappable.
        let input = include_str!("test_data/shift_jis_out.txt");
        let expectation = include_bytes!("test_data/shift_jis_out_ref.txt");
        let (cow, encoding, had_errors) = SHIFT_JIS.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, SHIFT_JIS);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    fn test_shift_jis_half_width_katakana_length() {
        // The buffer length reported for one input byte must be enough for
        // the three UTF-8 bytes of a half-width katakana character.
        let mut output = [0u8; 20];
        let mut decoder = SHIFT_JIS.new_decoder();
        {
            let needed = decoder
                .max_utf8_buffer_length_without_replacement(1)
                .unwrap();
            let (result, read, written) =
                decoder.decode_to_utf8_without_replacement(b"\xA1", &mut output[..needed], true);
            assert_eq!(result, DecoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 3);
            // U+FF61 encoded as UTF-8.
            assert_eq!(output[0], 0xEF);
            assert_eq!(output[1], 0xBD);
            assert_eq!(output[2], 0xA1);
        }
    }
}
|
|
@ -0,0 +1,247 @@
|
|||
// Copyright 2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use simd::u8x16;
|
||||
use simd::i8x16;
|
||||
use simd::u16x8;
|
||||
use simd::i16x8;
|
||||
use simd::Simd;
|
||||
|
||||
extern "platform-intrinsic" {
|
||||
fn simd_shuffle16<T: Simd, U: Simd<Elem = T::Elem>>(x: T, y: T, idx: [u32; 16]) -> U;
|
||||
fn x86_mm_packus_epi16(x: i16x8, y: i16x8) -> u8x16;
|
||||
fn x86_mm_movemask_epi8(x: i8x16) -> i32;
|
||||
}
|
||||
|
||||
// TODO: Migrate unaligned access to stdlib code if/when the RFC
|
||||
// https://github.com/rust-lang/rfcs/pull/1725 is implemented.
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn load16_unaligned(ptr: *const u8) -> u8x16 {
|
||||
let mut simd = ::std::mem::uninitialized();
|
||||
::std::ptr::copy_nonoverlapping(ptr, &mut simd as *mut u8x16 as *mut u8, 16);
|
||||
simd
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn load16_aligned(ptr: *const u8) -> u8x16 {
|
||||
*(ptr as *const u8x16)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn store16_unaligned(ptr: *mut u8, s: u8x16) {
|
||||
::std::ptr::copy_nonoverlapping(&s as *const u8x16 as *const u8, ptr, 16);
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn store16_aligned(ptr: *mut u8, s: u8x16) {
|
||||
*(ptr as *mut u8x16) = s;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn load8_unaligned(ptr: *const u16) -> u16x8 {
|
||||
let mut simd = ::std::mem::uninitialized();
|
||||
::std::ptr::copy_nonoverlapping(ptr as *const u8, &mut simd as *mut u16x8 as *mut u8, 16);
|
||||
simd
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn load8_aligned(ptr: *const u16) -> u16x8 {
|
||||
*(ptr as *const u16x8)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn store8_unaligned(ptr: *mut u16, s: u16x8) {
|
||||
::std::ptr::copy_nonoverlapping(&s as *const u16x8 as *const u8, ptr as *mut u8, 16);
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn store8_aligned(ptr: *mut u16, s: u16x8) {
|
||||
*(ptr as *mut u16x8) = s;
|
||||
}
|
||||
|
||||
/// _mm_movemask_epi8 in SSE2. vec_all_lt in AltiVec.
|
||||
#[inline(always)]
|
||||
pub fn is_ascii(s: u8x16) -> bool {
|
||||
unsafe {
|
||||
let signed: i8x16 = ::std::mem::transmute_copy(&s);
|
||||
x86_mm_movemask_epi8(signed) == 0
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// _mm_movemask_epi8 in SSE2.
|
||||
#[inline(always)]
|
||||
pub fn check_ascii(s: u8x16) -> Option<usize> {
|
||||
let mask = unsafe {
|
||||
let signed: i8x16 = ::std::mem::transmute_copy(&s);
|
||||
x86_mm_movemask_epi8(signed)
|
||||
};
|
||||
if mask == 0 {
|
||||
return None;
|
||||
}
|
||||
// We don't extract the non-ascii byte from the SIMD register, because
|
||||
// at least on Haswell, it seems faster to let the caller re-read it from
|
||||
// memory.
|
||||
Some(mask.trailing_zeros() as usize)
|
||||
}
|
||||
|
||||
/// vzipq_u8 in NEON. _mm_unpacklo_epi8 and
|
||||
/// _mm_unpackhi_epi8 in SSE2. vec_mergeh and vec_mergel or vec_unpackh and
|
||||
/// vec_unpackl in AltiVec.
|
||||
#[inline(always)]
|
||||
pub fn unpack(s: u8x16) -> (u16x8, u16x8) {
|
||||
unsafe {
|
||||
let first: u8x16 = simd_shuffle16(
|
||||
s,
|
||||
u8x16::splat(0),
|
||||
[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
|
||||
);
|
||||
let second: u8x16 = simd_shuffle16(
|
||||
s,
|
||||
u8x16::splat(0),
|
||||
[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
|
||||
);
|
||||
(::std::mem::transmute_copy(&first), ::std::mem::transmute_copy(&second))
|
||||
}
|
||||
}
|
||||
|
||||
/// vuzpq_u8 in NEON. _mm_packus_epi16 in SSE2. vec_packsu *followed* by ASCII
|
||||
/// check in AltiVec.
|
||||
#[inline(always)]
|
||||
pub unsafe fn pack_basic_latin(a: u16x8, b: u16x8) -> Option<u8x16> {
|
||||
// If the 16-bit lane is out of range positive, the 8-bit lane becomes 0xFF
|
||||
// when packing, which would allow us to pack later and then check for
|
||||
// ASCII, but if the 16-bit lane is negative, the 8-bit lane becomes 0x00.
|
||||
// Sigh. Hence, check first.
|
||||
let above_ascii = u16x8::splat(0x80);
|
||||
if a.lt(above_ascii).all() && b.lt(above_ascii).all() {
|
||||
let first: i16x8 = ::std::mem::transmute_copy(&a);
|
||||
let second: i16x8 = ::std::mem::transmute_copy(&b);
|
||||
Some(x86_mm_packus_epi16(first, second))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_unpack() {
|
||||
let ascii: [u8; 16] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71,
|
||||
0x72, 0x73, 0x74, 0x75, 0x76];
|
||||
let basic_latin: [u16; 16] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70,
|
||||
0x71, 0x72, 0x73, 0x74, 0x75, 0x76];
|
||||
let simd = unsafe { load16_unaligned(ascii.as_ptr()) };
|
||||
let mut vec = Vec::with_capacity(16);
|
||||
vec.resize(16, 0u16);
|
||||
let (first, second) = unpack(simd);
|
||||
let ptr = vec.as_mut_ptr();
|
||||
unsafe {
|
||||
store8_unaligned(ptr, first);
|
||||
store8_unaligned(ptr.offset(8), second);
|
||||
}
|
||||
assert_eq!(&vec[..], &basic_latin[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pack_basic_latin_success() {
|
||||
let ascii: [u8; 16] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71,
|
||||
0x72, 0x73, 0x74, 0x75, 0x76];
|
||||
let basic_latin: [u16; 16] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70,
|
||||
0x71, 0x72, 0x73, 0x74, 0x75, 0x76];
|
||||
let first = unsafe { load8_unaligned(basic_latin.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(basic_latin.as_ptr().offset(8)) };
|
||||
let mut vec = Vec::with_capacity(16);
|
||||
vec.resize(16, 0u8);
|
||||
let ptr = vec.as_mut_ptr();
|
||||
unsafe {
|
||||
let packed = pack_basic_latin(first, second).unwrap();
|
||||
store16_unaligned(ptr, packed);
|
||||
}
|
||||
assert_eq!(&vec[..], &ascii[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pack_basic_latin_c0() {
|
||||
let input: [u16; 16] = [0x61, 0x62, 0x63, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71,
|
||||
0x72, 0x73, 0x74, 0x75, 0x76];
|
||||
let first = unsafe { load8_unaligned(input.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) };
|
||||
unsafe {
|
||||
assert!(pack_basic_latin(first, second).is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pack_basic_latin_0fff() {
|
||||
let input: [u16; 16] = [0x61, 0x62, 0x63, 0x0FFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70,
|
||||
0x71, 0x72, 0x73, 0x74, 0x75, 0x76];
|
||||
let first = unsafe { load8_unaligned(input.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) };
|
||||
unsafe {
|
||||
assert!(pack_basic_latin(first, second).is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pack_basic_latin_ffff() {
|
||||
let input: [u16; 16] = [0x61, 0x62, 0x63, 0xFFFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70,
|
||||
0x71, 0x72, 0x73, 0x74, 0x75, 0x76];
|
||||
let first = unsafe { load8_unaligned(input.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) };
|
||||
unsafe {
|
||||
assert!(pack_basic_latin(first, second).is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_is_ascii_success() {
|
||||
let ascii: [u8; 16] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71,
|
||||
0x72, 0x73, 0x74, 0x75, 0x76];
|
||||
let simd = unsafe { load16_unaligned(ascii.as_ptr()) };
|
||||
assert!(is_ascii(simd));
|
||||
}
|
||||
|
||||
#[test]
fn test_is_ascii_failure() {
    // The byte at index 4 (0x81) has its high bit set.
    let bytes: [u8; 16] = [
        0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
        0x76,
    ];
    let vector = unsafe { load16_unaligned(bytes.as_ptr()) };
    assert!(!is_ascii(vector));
}
|
||||
|
||||
#[test]
fn test_check_ascii() {
    // The only non-ASCII byte (0x81) sits at index 4.
    let bytes: [u8; 16] = [
        0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
        0x76,
    ];
    let vector = unsafe { load16_unaligned(bytes.as_ptr()) };
    if let Some(ascii_prefix_len) = check_ascii(vector) {
        assert_eq!(ascii_prefix_len, 4);
    } else {
        // The input is known to contain a non-ASCII byte.
        unreachable!();
    }
}
|
||||
|
||||
#[test]
fn test_alu() {
    // Demonstrates locating the first non-ASCII byte with plain integer
    // arithmetic: mask the high bit of every byte in a 64-bit word and count
    // the zero bits below the first set one.
    let input: [u8; 16] = [
        0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
        0x76,
    ];
    let mut alu = 0u64;
    unsafe {
        // SAFETY: `input` holds 16 readable bytes, `alu` is exactly 8 writable
        // bytes, and the two do not overlap.
        ::std::ptr::copy_nonoverlapping(input.as_ptr(), &mut alu as *mut u64 as *mut u8, 8);
    }
    // Interpret the copied bytes as little-endian so that `input[0]` is the
    // least significant byte on every target; this is a no-op on
    // little-endian machines. Without it the expected count below would only
    // hold for little-endian memory layout.
    let masked = u64::from_le(alu) & 0x8080_8080_8080_8080;
    // Byte 4 is the first with its high bit set: 4 * 8 + 7 = 39.
    assert_eq!(masked.trailing_zeros(), 39);
}
|
||||
|
||||
}
|
|
@ -0,0 +1,627 @@
|
|||
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use handles::*;
|
||||
use variant::*;
|
||||
use ascii::*;
|
||||
use super::*;
|
||||
|
||||
/// Decoder for a single-byte encoding described by a 128-entry table.
pub struct SingleByteDecoder {
    /// UTF-16 code unit for each byte `0x80..=0xFF`, indexed by `byte - 0x80`.
    /// An entry of `0` marks a byte the decoder reports as malformed.
    table: &'static [u16; 128],
}
|
||||
|
||||
impl SingleByteDecoder {
|
||||
pub fn new(data: &'static [u16; 128]) -> VariantDecoder {
|
||||
VariantDecoder::SingleByte(SingleByteDecoder { table: data })
|
||||
}
|
||||
|
||||
/// Returns the UTF-16 output length, in code units, sufficient for decoding
/// `byte_length` input bytes. Always `Some(byte_length)`: the UTF-16 decode
/// path in this type writes one code unit per input byte.
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
    Some(byte_length)
}
|
||||
|
||||
/// Returns three times `byte_length` as the UTF-8 output size bound, or
/// `None` if that multiplication overflows `usize`.
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
    byte_length.checked_mul(3)
}
|
||||
|
||||
/// Returns three times `byte_length` as the UTF-8 output size bound, or
/// `None` if that multiplication overflows `usize`. Same computation as
/// `max_utf8_buffer_length_without_replacement`.
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
    byte_length.checked_mul(3)
}
|
||||
|
||||
pub fn decode_to_utf8_raw(&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u8],
|
||||
_last: bool)
|
||||
-> (DecoderResult, usize, usize) {
|
||||
let mut source = ByteSource::new(src);
|
||||
let mut dest = Utf8Destination::new(dst);
|
||||
'outermost: loop {
|
||||
match dest.copy_ascii_from_check_space_bmp(&mut source) {
|
||||
CopyAsciiResult::Stop(ret) => return ret,
|
||||
CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => {
|
||||
'middle: loop {
|
||||
// Start non-boilerplate
|
||||
//
|
||||
// Since the non-ASCIIness of `non_ascii` is hidden from
|
||||
// the optimizer, it can't figure out that it's OK to
|
||||
// statically omit the bound check when accessing
|
||||
// `[u16; 128]` with an index
|
||||
// `non_ascii as usize - 0x80usize`.
|
||||
let mapped =
|
||||
unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
|
||||
// let mapped = self.table[non_ascii as usize - 0x80usize];
|
||||
if mapped == 0u16 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
let dest_again = handle.write_bmp_excl_ascii(mapped);
|
||||
// End non-boilerplate
|
||||
match source.check_available() {
|
||||
Space::Full(src_consumed) => {
|
||||
return (DecoderResult::InputEmpty,
|
||||
src_consumed,
|
||||
dest_again.written());
|
||||
}
|
||||
Space::Available(source_handle) => {
|
||||
match dest_again.check_space_bmp() {
|
||||
Space::Full(dst_written) => {
|
||||
return (DecoderResult::OutputFull,
|
||||
source_handle.consumed(),
|
||||
dst_written);
|
||||
}
|
||||
Space::Available(mut destination_handle) => {
|
||||
let (mut b, unread_handle) = source_handle.read();
|
||||
let source_again = unread_handle.commit();
|
||||
'innermost: loop {
|
||||
if b > 127 {
|
||||
non_ascii = b;
|
||||
handle = destination_handle;
|
||||
continue 'middle;
|
||||
}
|
||||
// Testing on Haswell says that we should write the
|
||||
// byte unconditionally instead of trying to unread it
|
||||
// to make it part of the next SIMD stride.
|
||||
let dest_again_again =
|
||||
destination_handle.write_ascii(b);
|
||||
if b < 60 {
|
||||
// We've got punctuation
|
||||
match source_again.check_available() {
|
||||
Space::Full(src_consumed_again) => {
|
||||
return (DecoderResult::InputEmpty,
|
||||
src_consumed_again,
|
||||
dest_again_again.written());
|
||||
}
|
||||
Space::Available(source_handle_again) => {
|
||||
match dest_again_again.check_space_bmp() {
|
||||
Space::Full(dst_written_again) => {
|
||||
return (DecoderResult::OutputFull,
|
||||
source_handle_again
|
||||
.consumed(),
|
||||
dst_written_again);
|
||||
}
|
||||
Space::Available(destination_handle_again) => {
|
||||
{
|
||||
let (b_again, _unread_handle_again) =
|
||||
source_handle_again.read();
|
||||
b = b_again;
|
||||
destination_handle = destination_handle_again;
|
||||
continue 'innermost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// We've got markup or ASCII text
|
||||
continue 'outermost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_raw(&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
_last: bool)
|
||||
-> (DecoderResult, usize, usize) {
|
||||
let (pending, length) = if dst.len() < src.len() {
|
||||
(DecoderResult::OutputFull, dst.len())
|
||||
} else {
|
||||
(DecoderResult::InputEmpty, src.len())
|
||||
};
|
||||
let mut converted = 0usize;
|
||||
'outermost: loop {
|
||||
match unsafe {
|
||||
ascii_to_basic_latin(
|
||||
src.as_ptr().offset(converted as isize),
|
||||
dst.as_mut_ptr().offset(converted as isize),
|
||||
length - converted,
|
||||
)
|
||||
} {
|
||||
None => {
|
||||
return (pending, length, length);
|
||||
}
|
||||
Some((mut non_ascii, consumed)) => {
|
||||
converted += consumed;
|
||||
'middle: loop {
|
||||
// `converted` doesn't count the reading of `non_ascii` yet.
|
||||
// Since the non-ASCIIness of `non_ascii` is hidden from
|
||||
// the optimizer, it can't figure out that it's OK to
|
||||
// statically omit the bound check when accessing
|
||||
// `[u16; 128]` with an index
|
||||
// `non_ascii as usize - 0x80usize`.
|
||||
let mapped =
|
||||
unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
|
||||
// let mapped = self.table[non_ascii as usize - 0x80usize];
|
||||
if mapped == 0u16 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted);
|
||||
}
|
||||
unsafe {
|
||||
// The bound check has already been performed
|
||||
*(dst.get_unchecked_mut(converted)) = mapped;
|
||||
}
|
||||
converted += 1;
|
||||
// Next, handle ASCII punctuation and non-ASCII without
|
||||
// going back to ASCII acceleration. Non-ASCII scripts
|
||||
// use ASCII punctuation, so this avoid going to
|
||||
// acceleration just for punctuation/space and then
|
||||
// failing. This is a significant boost to non-ASCII
|
||||
// scripts.
|
||||
// TODO: Split out Latin converters without this part
|
||||
// this stuff makes Latin script-conversion slower.
|
||||
if converted == length {
|
||||
return (pending, length, length);
|
||||
}
|
||||
let mut b = unsafe { *(src.get_unchecked(converted)) };
|
||||
'innermost: loop {
|
||||
if b > 127 {
|
||||
non_ascii = b;
|
||||
continue 'middle;
|
||||
}
|
||||
// Testing on Haswell says that we should write the
|
||||
// byte unconditionally instead of trying to unread it
|
||||
// to make it part of the next SIMD stride.
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(converted)) = b as u16;
|
||||
}
|
||||
converted += 1;
|
||||
if b < 60 {
|
||||
// We've got punctuation
|
||||
if converted == length {
|
||||
return (pending, length, length);
|
||||
}
|
||||
b = unsafe { *(src.get_unchecked(converted)) };
|
||||
continue 'innermost;
|
||||
}
|
||||
// We've got markup or ASCII text
|
||||
continue 'outermost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Encoder for a single-byte encoding described by a 128-entry table.
pub struct SingleByteEncoder {
    /// UTF-16 code unit for each byte `0x80..=0xFF`, indexed by `byte - 0x80`.
    /// Encoding searches this table for the code unit to find the byte.
    table: &'static [u16; 128],
}
|
||||
|
||||
impl SingleByteEncoder {
|
||||
pub fn new(encoding: &'static Encoding, data: &'static [u16; 128]) -> Encoder {
|
||||
Encoder::new(
|
||||
encoding,
|
||||
VariantEncoder::SingleByte(SingleByteEncoder { table: data }),
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns the output size bound for encoding `u16_length` UTF-16 code
/// units. Always `Some(u16_length)`.
pub fn max_buffer_length_from_utf16_without_replacement(&self,
                                                        u16_length: usize)
                                                        -> Option<usize> {
    Some(u16_length)
}
|
||||
|
||||
/// Returns the output size bound for encoding `byte_length` bytes of UTF-8
/// input. Always `Some(byte_length)`.
pub fn max_buffer_length_from_utf8_without_replacement(&self,
                                                       byte_length: usize)
                                                       -> Option<usize> {
    Some(byte_length)
}
|
||||
|
||||
fn encode_u16(&self, code_unit: u16) -> Option<u8> {
|
||||
// We search the quadrants in reverse order, but we search forward
|
||||
// within each quadrant. For Windows and ISO encodings, this is
|
||||
// generally faster than just searching the whole table backwards.
|
||||
// (Exceptions: English, German, Czech.) This order is also OK for
|
||||
// KOI encodings. For IBM and Mac encodings, this order is bad,
|
||||
// but we don't really need to optimize for those encodings anyway.
|
||||
|
||||
// In Windows and ISO encodings, the fourth quadrant holds most of the
|
||||
// lower-case letters for bicameral scripts as well as the Hebrew
|
||||
// letters. There are some Thai letters and combining marks as well as
|
||||
// Thai numerals here. (In KOI8-R, the upper-case letters are here.)
|
||||
for i in 96..128 {
|
||||
if self.table[i] == code_unit {
|
||||
return Some((i + 128) as u8);
|
||||
}
|
||||
}
|
||||
|
||||
// In Windows and ISO encodings, the third quadrant holds most of the
|
||||
// upper-case letters for bicameral scripts as well as most of the
|
||||
// Arabic letters. Searching this quadrant first would be better for
|
||||
// Arabic. There are a number of Thai letters and combining marks here.
|
||||
// (In KOI8-R, the lower-case letters are here.)
|
||||
for i in 64..96 {
|
||||
if self.table[i] == code_unit {
|
||||
return Some((i + 128) as u8);
|
||||
}
|
||||
}
|
||||
|
||||
// In Windows and ISO encodings, the second quadrant hold most of the
|
||||
// Thai letters. In other scripts, there tends to be symbols here.
|
||||
// Even though the two quadrants above are relevant for Thai, for Thai
|
||||
// it would likely be optimal to search this quadrant first. :-(
|
||||
for i in 32..64 {
|
||||
if self.table[i] == code_unit {
|
||||
return Some((i + 128) as u8);
|
||||
}
|
||||
}
|
||||
|
||||
// The first quadrant is useless in ISO encodings. In Windows encodings,
|
||||
// there is useful punctuation here that might warrant searching
|
||||
// before the symbols in the second quadrant, but the second quadrant
|
||||
// is searched before this one for the benefit of Thai.
|
||||
for i in 0..32 {
|
||||
if self.table[i] == code_unit {
|
||||
return Some((i + 128) as u8);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
// Invokes `ascii_compatible_bmp_encoder_function!` (defined outside this
// file) with `encode_from_utf8_raw`, `str` and `Utf8Source` as arguments.
// NOTE(review): presumably this expands to the `encode_from_utf8_raw`
// method — confirm against the macro definition.
// The braced block is the per-character step: look `bmp` up with
// `encode_u16`, write the byte on a hit, otherwise return an unmappable
// result together with the read and written counts.
ascii_compatible_bmp_encoder_function!(
    {
        match self.encode_u16(bmp) {
            Some(byte) => handle.write_one(byte),
            None => {
                return (EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written());
            }
        }
    },
    bmp,
    self,
    source,
    handle,
    copy_ascii_to_check_space_one,
    check_space_one,
    encode_from_utf8_raw,
    str,
    Utf8Source,
    true
);
|
||||
|
||||
pub fn encode_from_utf16_raw(&mut self,
|
||||
src: &[u16],
|
||||
dst: &mut [u8],
|
||||
_last: bool)
|
||||
-> (EncoderResult, usize, usize) {
|
||||
let (pending, length) = if dst.len() < src.len() {
|
||||
(EncoderResult::OutputFull, dst.len())
|
||||
} else {
|
||||
(EncoderResult::InputEmpty, src.len())
|
||||
};
|
||||
let mut converted = 0usize;
|
||||
'outermost: loop {
|
||||
match unsafe {
|
||||
basic_latin_to_ascii(
|
||||
src.as_ptr().offset(converted as isize),
|
||||
dst.as_mut_ptr().offset(converted as isize),
|
||||
length - converted,
|
||||
)
|
||||
} {
|
||||
None => {
|
||||
return (pending, length, length);
|
||||
}
|
||||
Some((mut non_ascii, consumed)) => {
|
||||
converted += consumed;
|
||||
'middle: loop {
|
||||
// `converted` doesn't count the reading of `non_ascii` yet.
|
||||
match self.encode_u16(non_ascii) {
|
||||
Some(byte) => {
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(converted)) = byte;
|
||||
}
|
||||
converted += 1;
|
||||
}
|
||||
None => {
|
||||
// At this point, we need to know if we
|
||||
// have a surrogate.
|
||||
let high_bits = non_ascii & 0xFC00u16;
|
||||
if high_bits == 0xD800u16 {
|
||||
// high surrogate
|
||||
if converted + 1 == length {
|
||||
// End of buffer. This surrogate is unpaired.
|
||||
return (EncoderResult::Unmappable('\u{FFFD}'),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted);
|
||||
}
|
||||
let second = unsafe { *src.get_unchecked(converted + 1) } as
|
||||
u32;
|
||||
if second & 0xFC00u32 != 0xDC00u32 {
|
||||
return (EncoderResult::Unmappable('\u{FFFD}'),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted);
|
||||
}
|
||||
// The next code unit is a low surrogate.
|
||||
let astral: char = unsafe {
|
||||
::std::mem::transmute(
|
||||
((non_ascii as u32) << 10) + second -
|
||||
(((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32)
|
||||
)
|
||||
};
|
||||
return (EncoderResult::Unmappable(astral),
|
||||
converted + 2, // +2 `for non_ascii` and `second`
|
||||
converted);
|
||||
}
|
||||
if high_bits == 0xDC00u16 {
|
||||
// Unpaired low surrogate
|
||||
return (EncoderResult::Unmappable('\u{FFFD}'),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted);
|
||||
}
|
||||
let thirty_two = non_ascii as u32;
|
||||
let bmp: char = unsafe { ::std::mem::transmute(thirty_two) };
|
||||
return (EncoderResult::Unmappable(bmp),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted);
|
||||
}
|
||||
}
|
||||
// Next, handle ASCII punctuation and non-ASCII without
|
||||
// going back to ASCII acceleration. Non-ASCII scripts
|
||||
// use ASCII punctuation, so this avoid going to
|
||||
// acceleration just for punctuation/space and then
|
||||
// failing. This is a significant boost to non-ASCII
|
||||
// scripts.
|
||||
// TODO: Split out Latin converters without this part
|
||||
// this stuff makes Latin script-conversion slower.
|
||||
if converted == length {
|
||||
return (pending, length, length);
|
||||
}
|
||||
let mut unit = unsafe { *(src.get_unchecked(converted)) };
|
||||
'innermost: loop {
|
||||
if unit > 127 {
|
||||
non_ascii = unit;
|
||||
continue 'middle;
|
||||
}
|
||||
// Testing on Haswell says that we should write the
|
||||
// byte unconditionally instead of trying to unread it
|
||||
// to make it part of the next SIMD stride.
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(converted)) = unit as u8;
|
||||
}
|
||||
converted += 1;
|
||||
if unit < 60 {
|
||||
// We've got punctuation
|
||||
if converted == length {
|
||||
return (pending, length, length);
|
||||
}
|
||||
unit = unsafe { *(src.get_unchecked(converted)) };
|
||||
continue 'innermost;
|
||||
}
|
||||
// We've got markup or ASCII text
|
||||
continue 'outermost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::super::data::*;
|
||||
use super::super::testing::*;
|
||||
use super::super::*;
|
||||
|
||||
#[test]
fn test_windows_1255_ca() {
    // Byte 0xCA and U+05BA must map to each other in windows-1255.
    let encoded = b"\xCA";
    let decoded = "\u{05BA}";
    decode(WINDOWS_1255, encoded, decoded);
    encode(WINDOWS_1255, decoded, encoded);
}
|
||||
|
||||
#[test]
fn test_ascii_punctuation() {
    // Greek text with ASCII spaces and full stops between the words.
    let encoded = b"\xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4. \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4.";
    let text = "\u{0391}\u{03C5}\u{03C4}\u{03CC} \
                \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
                \u{03C4}\u{03B5}\u{03C3}\u{03C4}. \u{0391}\u{03C5}\u{03C4}\u{03CC} \
                \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
                \u{03C4}\u{03B5}\u{03C3}\u{03C4}.";
    decode(WINDOWS_1253, encoded, text);
    encode(WINDOWS_1253, text, encoded);
}
|
||||
|
||||
#[test]
fn test_decode_malformed() {
    // The expected text has U+FFFD in the position of byte 0xD2.
    let encoded = b"\xC1\xF5\xD2\xF4\xFC";
    let text = "\u{0391}\u{03C5}\u{FFFD}\u{03C4}\u{03CC}";
    decode(WINDOWS_1253, encoded, text);
}
|
||||
|
||||
#[test]
fn test_encode_unmappables() {
    // The expected output spells each unmappable character as an ASCII
    // decimal numeric character reference. Byte string literals may only
    // contain ASCII, so the reference must be written out, not the
    // character itself.

    // U+2603 is 9731 in decimal.
    encode(
        WINDOWS_1253,
        "\u{0391}\u{03C5}\u{2603}\u{03C4}\u{03CC}",
        b"\xC1\xF5&#9731;\xF4\xFC",
    );
    // U+1F4A9 (an astral character) is 128169 in decimal.
    encode(
        WINDOWS_1253,
        "\u{0391}\u{03C5}\u{1F4A9}\u{03C4}\u{03CC}",
        b"\xC1\xF5&#128169;\xF4\xFC",
    );
}
|
||||
|
||||
#[test]
fn test_encode_unpaired_surrogates() {
    // An unpaired surrogate is reported as U+FFFD, which is 65533 in
    // decimal. The expected output spells it as an ASCII decimal numeric
    // character reference, because byte string literals may only contain
    // ASCII.

    // Lone low surrogate in the middle.
    encode_from_utf16(
        WINDOWS_1253,
        &[0x0391u16, 0x03C5u16, 0xDCA9u16, 0x03C4u16, 0x03CCu16],
        b"\xC1\xF5&#65533;\xF4\xFC",
    );
    // Lone high surrogate in the middle.
    encode_from_utf16(
        WINDOWS_1253,
        &[0x0391u16, 0x03C5u16, 0xD83Du16, 0x03C4u16, 0x03CCu16],
        b"\xC1\xF5&#65533;\xF4\xFC",
    );
    // Lone high surrogate at the very end of the input.
    encode_from_utf16(
        WINDOWS_1253,
        &[0x0391u16, 0x03C5u16, 0x03C4u16, 0x03CCu16, 0xD83Du16],
        b"\xC1\xF5\xF4\xFC&#65533;",
    );
}
|
||||
|
||||
/// Every byte value from 0x80 through 0xFF in ascending order, i.e. the
/// upper half of a single-byte encoding.
pub const HIGH_BYTES: &'static [u8; 128] = &[
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
    0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
];
|
||||
|
||||
fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
|
||||
let mut with_replacement = [0u16; 128];
|
||||
let mut it = data.iter().enumerate();
|
||||
loop {
|
||||
match it.next() {
|
||||
Some((i, code_point)) => {
|
||||
if *code_point == 0 {
|
||||
with_replacement[i] = 0xFFFD;
|
||||
} else {
|
||||
with_replacement[i] = *code_point;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
decode_to_utf16(encoding, HIGH_BYTES, &with_replacement[..]);
|
||||
}
|
||||
|
||||
fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
|
||||
let mut with_zeros = [0u8; 128];
|
||||
let mut it = data.iter().enumerate();
|
||||
loop {
|
||||
match it.next() {
|
||||
Some((i, code_point)) => {
|
||||
if *code_point == 0 {
|
||||
with_zeros[i] = 0;
|
||||
} else {
|
||||
with_zeros[i] = HIGH_BYTES[i];
|
||||
}
|
||||
}
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
encode_from_utf16(encoding, data, &with_zeros[..]);
|
||||
}
|
||||
|
||||
// These tests are so self-referential that they are pretty useless.
|
||||
|
||||
// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
|
||||
// Instead, please regenerate using generate-encoding-data.py
|
||||
|
||||
#[test]
|
||||
fn test_single_byte_decode() {
|
||||
decode_single_byte(IBM866, IBM866_DATA);
|
||||
decode_single_byte(ISO_8859_10, ISO_8859_10_DATA);
|
||||
decode_single_byte(ISO_8859_13, ISO_8859_13_DATA);
|
||||
decode_single_byte(ISO_8859_14, ISO_8859_14_DATA);
|
||||
decode_single_byte(ISO_8859_15, ISO_8859_15_DATA);
|
||||
decode_single_byte(ISO_8859_16, ISO_8859_16_DATA);
|
||||
decode_single_byte(ISO_8859_2, ISO_8859_2_DATA);
|
||||
decode_single_byte(ISO_8859_3, ISO_8859_3_DATA);
|
||||
decode_single_byte(ISO_8859_4, ISO_8859_4_DATA);
|
||||
decode_single_byte(ISO_8859_5, ISO_8859_5_DATA);
|
||||
decode_single_byte(ISO_8859_6, ISO_8859_6_DATA);
|
||||
decode_single_byte(ISO_8859_7, ISO_8859_7_DATA);
|
||||
decode_single_byte(ISO_8859_8, ISO_8859_8_DATA);
|
||||
decode_single_byte(KOI8_R, KOI8_R_DATA);
|
||||
decode_single_byte(KOI8_U, KOI8_U_DATA);
|
||||
decode_single_byte(MACINTOSH, MACINTOSH_DATA);
|
||||
decode_single_byte(WINDOWS_1250, WINDOWS_1250_DATA);
|
||||
decode_single_byte(WINDOWS_1251, WINDOWS_1251_DATA);
|
||||
decode_single_byte(WINDOWS_1252, WINDOWS_1252_DATA);
|
||||
decode_single_byte(WINDOWS_1253, WINDOWS_1253_DATA);
|
||||
decode_single_byte(WINDOWS_1254, WINDOWS_1254_DATA);
|
||||
decode_single_byte(WINDOWS_1255, WINDOWS_1255_DATA);
|
||||
decode_single_byte(WINDOWS_1256, WINDOWS_1256_DATA);
|
||||
decode_single_byte(WINDOWS_1257, WINDOWS_1257_DATA);
|
||||
decode_single_byte(WINDOWS_1258, WINDOWS_1258_DATA);
|
||||
decode_single_byte(WINDOWS_874, WINDOWS_874_DATA);
|
||||
decode_single_byte(X_MAC_CYRILLIC, X_MAC_CYRILLIC_DATA);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_single_byte_encode() {
|
||||
encode_single_byte(IBM866, IBM866_DATA);
|
||||
encode_single_byte(ISO_8859_10, ISO_8859_10_DATA);
|
||||
encode_single_byte(ISO_8859_13, ISO_8859_13_DATA);
|
||||
encode_single_byte(ISO_8859_14, ISO_8859_14_DATA);
|
||||
encode_single_byte(ISO_8859_15, ISO_8859_15_DATA);
|
||||
encode_single_byte(ISO_8859_16, ISO_8859_16_DATA);
|
||||
encode_single_byte(ISO_8859_2, ISO_8859_2_DATA);
|
||||
encode_single_byte(ISO_8859_3, ISO_8859_3_DATA);
|
||||
encode_single_byte(ISO_8859_4, ISO_8859_4_DATA);
|
||||
encode_single_byte(ISO_8859_5, ISO_8859_5_DATA);
|
||||
encode_single_byte(ISO_8859_6, ISO_8859_6_DATA);
|
||||
encode_single_byte(ISO_8859_7, ISO_8859_7_DATA);
|
||||
encode_single_byte(ISO_8859_8, ISO_8859_8_DATA);
|
||||
encode_single_byte(KOI8_R, KOI8_R_DATA);
|
||||
encode_single_byte(KOI8_U, KOI8_U_DATA);
|
||||
encode_single_byte(MACINTOSH, MACINTOSH_DATA);
|
||||
encode_single_byte(WINDOWS_1250, WINDOWS_1250_DATA);
|
||||
encode_single_byte(WINDOWS_1251, WINDOWS_1251_DATA);
|
||||
encode_single_byte(WINDOWS_1252, WINDOWS_1252_DATA);
|
||||
encode_single_byte(WINDOWS_1253, WINDOWS_1253_DATA);
|
||||
encode_single_byte(WINDOWS_1254, WINDOWS_1254_DATA);
|
||||
encode_single_byte(WINDOWS_1255, WINDOWS_1255_DATA);
|
||||
encode_single_byte(WINDOWS_1256, WINDOWS_1256_DATA);
|
||||
encode_single_byte(WINDOWS_1257, WINDOWS_1257_DATA);
|
||||
encode_single_byte(WINDOWS_1258, WINDOWS_1258_DATA);
|
||||
encode_single_byte(WINDOWS_874, WINDOWS_874_DATA);
|
||||
encode_single_byte(X_MAC_CYRILLIC, X_MAC_CYRILLIC_DATA);
|
||||
}
|
||||
// END GENERATED CODE
|
||||
|
||||
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,276 @@
|
|||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
|
||||
// Instead, please regenerate using generate-encoding-data.py
|
||||
|
||||
use super::*;
|
||||
|
||||
/// Checks that every known label resolves to the expected encoding.
#[test]
fn test_all_labels() {
    // (label, expected encoding) pairs, checked in the listed order.
    let expectations: &[(&[u8], &'static Encoding)] = &[
        (b"l1", WINDOWS_1252),
        (b"l2", ISO_8859_2),
        (b"l3", ISO_8859_3),
        (b"l4", ISO_8859_4),
        (b"l5", WINDOWS_1254),
        (b"l6", ISO_8859_10),
        (b"l9", ISO_8859_15),
        (b"866", IBM866),
        (b"mac", MACINTOSH),
        (b"koi", KOI8_R),
        (b"gbk", GBK),
        (b"big5", BIG5),
        (b"utf8", UTF_8),
        (b"koi8", KOI8_R),
        (b"sjis", SHIFT_JIS),
        (b"ms932", SHIFT_JIS),
        (b"cp866", IBM866),
        (b"utf-8", UTF_8),
        (b"cp819", WINDOWS_1252),
        (b"ascii", WINDOWS_1252),
        (b"x-gbk", GBK),
        (b"greek", ISO_8859_7),
        (b"cp1250", WINDOWS_1250),
        (b"cp1251", WINDOWS_1251),
        (b"latin1", WINDOWS_1252),
        (b"gb2312", GBK),
        (b"cp1252", WINDOWS_1252),
        (b"latin2", ISO_8859_2),
        (b"cp1253", WINDOWS_1253),
        (b"latin3", ISO_8859_3),
        (b"cp1254", WINDOWS_1254),
        (b"latin4", ISO_8859_4),
        (b"cp1255", WINDOWS_1255),
        (b"csbig5", BIG5),
        (b"latin5", WINDOWS_1254),
        (b"utf-16", UTF_16LE),
        (b"cp1256", WINDOWS_1256),
        (b"ibm866", IBM866),
        (b"latin6", ISO_8859_10),
        (b"cp1257", WINDOWS_1257),
        (b"cp1258", WINDOWS_1258),
        (b"greek8", ISO_8859_7),
        (b"ibm819", WINDOWS_1252),
        (b"arabic", ISO_8859_6),
        (b"visual", ISO_8859_8),
        (b"korean", EUC_KR),
        (b"euc-jp", EUC_JP),
        (b"koi8-r", KOI8_R),
        (b"koi8_r", KOI8_R),
        (b"euc-kr", EUC_KR),
        (b"x-sjis", SHIFT_JIS),
        (b"koi8-u", KOI8_U),
        (b"hebrew", ISO_8859_8),
        (b"tis-620", WINDOWS_874),
        (b"gb18030", GB18030),
        (b"ksc5601", EUC_KR),
        (b"gb_2312", GBK),
        (b"dos-874", WINDOWS_874),
        (b"cn-big5", BIG5),
        (b"chinese", GBK),
        (b"logical", ISO_8859_8_I),
        (b"cskoi8r", KOI8_R),
        (b"cseuckr", EUC_KR),
        (b"koi8-ru", KOI8_U),
        (b"x-cp1250", WINDOWS_1250),
        (b"ksc_5601", EUC_KR),
        (b"x-cp1251", WINDOWS_1251),
        (b"iso88591", WINDOWS_1252),
        (b"csgb2312", GBK),
        (b"x-cp1252", WINDOWS_1252),
        (b"iso88592", ISO_8859_2),
        (b"x-cp1253", WINDOWS_1253),
        (b"iso88593", ISO_8859_3),
        (b"ecma-114", ISO_8859_6),
        (b"x-cp1254", WINDOWS_1254),
        (b"iso88594", ISO_8859_4),
        (b"x-cp1255", WINDOWS_1255),
        (b"iso88595", ISO_8859_5),
        (b"x-x-big5", BIG5),
        (b"x-cp1256", WINDOWS_1256),
        (b"csibm866", IBM866),
        (b"iso88596", ISO_8859_6),
        (b"x-cp1257", WINDOWS_1257),
        (b"iso88597", ISO_8859_7),
        (b"asmo-708", ISO_8859_6),
        (b"ecma-118", ISO_8859_7),
        (b"elot_928", ISO_8859_7),
        (b"x-cp1258", WINDOWS_1258),
        (b"iso88598", ISO_8859_8),
        (b"iso88599", WINDOWS_1254),
        (b"cyrillic", ISO_8859_5),
        (b"utf-16be", UTF_16BE),
        (b"utf-16le", UTF_16LE),
        (b"us-ascii", WINDOWS_1252),
        (b"ms_kanji", SHIFT_JIS),
        (b"x-euc-jp", EUC_JP),
        (b"iso885910", ISO_8859_10),
        (b"iso8859-1", WINDOWS_1252),
        (b"iso885911", WINDOWS_874),
        (b"iso8859-2", ISO_8859_2),
        (b"iso8859-3", ISO_8859_3),
        (b"iso885913", ISO_8859_13),
        (b"iso8859-4", ISO_8859_4),
        (b"iso885914", ISO_8859_14),
        (b"iso8859-5", ISO_8859_5),
        (b"iso885915", ISO_8859_15),
        (b"iso8859-6", ISO_8859_6),
        (b"iso8859-7", ISO_8859_7),
        (b"iso8859-8", ISO_8859_8),
        (b"iso-ir-58", GBK),
        (b"iso8859-9", WINDOWS_1254),
        (b"macintosh", MACINTOSH),
        (b"shift-jis", SHIFT_JIS),
        (b"shift_jis", SHIFT_JIS),
        (b"iso-ir-100", WINDOWS_1252),
        (b"iso8859-10", ISO_8859_10),
        (b"iso-ir-110", ISO_8859_4),
        (b"gb_2312-80", GBK),
        (b"iso-8859-1", WINDOWS_1252),
        (b"iso_8859-1", WINDOWS_1252),
        (b"iso-ir-101", ISO_8859_2),
        (b"iso8859-11", WINDOWS_874),
        (b"iso-8859-2", ISO_8859_2),
        (b"iso_8859-2", ISO_8859_2),
        (b"hz-gb-2312", REPLACEMENT),
        (b"iso-8859-3", ISO_8859_3),
        (b"iso_8859-3", ISO_8859_3),
        (b"iso8859-13", ISO_8859_13),
        (b"iso-8859-4", ISO_8859_4),
        (b"iso_8859-4", ISO_8859_4),
        (b"iso8859-14", ISO_8859_14),
        (b"iso-ir-144", ISO_8859_5),
        (b"iso-8859-5", ISO_8859_5),
        (b"iso_8859-5", ISO_8859_5),
        (b"iso8859-15", ISO_8859_15),
        (b"iso-8859-6", ISO_8859_6),
        (b"iso_8859-6", ISO_8859_6),
        (b"iso-ir-126", ISO_8859_7),
        (b"iso-8859-7", ISO_8859_7),
        (b"iso_8859-7", ISO_8859_7),
        (b"iso-ir-127", ISO_8859_6),
        (b"iso-ir-157", ISO_8859_10),
        (b"iso-8859-8", ISO_8859_8),
        (b"iso_8859-8", ISO_8859_8),
        (b"iso-ir-138", ISO_8859_8),
        (b"iso-ir-148", WINDOWS_1254),
        (b"iso-8859-9", WINDOWS_1254),
        (b"iso_8859-9", WINDOWS_1254),
        (b"iso-ir-109", ISO_8859_3),
        (b"iso-ir-149", EUC_KR),
        (b"big5-hkscs", BIG5),
        (b"csshiftjis", SHIFT_JIS),
        (b"iso-8859-10", ISO_8859_10),
        (b"iso-8859-11", WINDOWS_874),
        (b"csisolatin1", WINDOWS_1252),
        (b"csisolatin2", ISO_8859_2),
        (b"iso-8859-13", ISO_8859_13),
        (b"csisolatin3", ISO_8859_3),
        (b"iso-8859-14", ISO_8859_14),
        (b"windows-874", WINDOWS_874),
        (b"csisolatin4", ISO_8859_4),
        (b"iso-8859-15", ISO_8859_15),
        (b"iso_8859-15", ISO_8859_15),
        (b"csisolatin5", WINDOWS_1254),
        (b"iso-8859-16", ISO_8859_16),
        (b"csisolatin6", ISO_8859_10),
        (b"windows-949", EUC_KR),
        (b"csisolatin9", ISO_8859_15),
        (b"csiso88596e", ISO_8859_6),
        (b"csiso88598e", ISO_8859_8),
        (b"csmacintosh", MACINTOSH),
        (b"csiso88596i", ISO_8859_6),
        (b"csiso88598i", ISO_8859_8_I),
        (b"windows-31j", SHIFT_JIS),
        (b"x-mac-roman", MACINTOSH),
        (b"iso-2022-cn", REPLACEMENT),
        (b"iso-2022-jp", ISO_2022_JP),
        (b"csiso2022jp", ISO_2022_JP),
        (b"iso-2022-kr", REPLACEMENT),
        (b"csiso2022kr", REPLACEMENT),
        (b"windows-1250", WINDOWS_1250),
        (b"windows-1251", WINDOWS_1251),
        (b"windows-1252", WINDOWS_1252),
        (b"windows-1253", WINDOWS_1253),
        (b"windows-1254", WINDOWS_1254),
        (b"windows-1255", WINDOWS_1255),
        (b"windows-1256", WINDOWS_1256),
        (b"windows-1257", WINDOWS_1257),
        (b"windows-1258", WINDOWS_1258),
        (b"iso-8859-6-e", ISO_8859_6),
        (b"iso-8859-8-e", ISO_8859_8),
        (b"iso-8859-6-i", ISO_8859_6),
        (b"iso-8859-8-i", ISO_8859_8_I),
        (b"sun_eu_greek", ISO_8859_7),
        (b"csksc56011987", EUC_KR),
        (b"ks_c_5601-1987", EUC_KR),
        (b"ansi_x3.4-1968", WINDOWS_1252),
        (b"ks_c_5601-1989", EUC_KR),
        (b"x-mac-cyrillic", X_MAC_CYRILLIC),
        (b"x-user-defined", X_USER_DEFINED),
        (b"csiso58gb231280", GBK),
        (b"iso_8859-1:1987", WINDOWS_1252),
        (b"iso_8859-2:1987", ISO_8859_2),
        (b"iso_8859-6:1987", ISO_8859_6),
        (b"iso_8859-7:1987", ISO_8859_7),
        (b"iso_8859-3:1988", ISO_8859_3),
        (b"iso_8859-4:1988", ISO_8859_4),
        (b"iso_8859-5:1988", ISO_8859_5),
        (b"iso_8859-8:1988", ISO_8859_8),
        (b"iso_8859-9:1989", WINDOWS_1254),
        (b"csisolatingreek", ISO_8859_7),
        (b"x-mac-ukrainian", X_MAC_CYRILLIC),
        (b"iso-2022-cn-ext", REPLACEMENT),
        (b"csisolatinarabic", ISO_8859_6),
        (b"csisolatinhebrew", ISO_8859_8),
        (b"unicode-1-1-utf-8", UTF_8),
        (b"csisolatincyrillic", ISO_8859_5),
        (b"cseucpkdfmtjapanese", EUC_JP),
    ];
    for &(label, encoding) in expectations {
        assert_eq!(Encoding::for_label(label), Some(encoding));
    }
}
|
||||
|
||||
/// Checks that every encoding name resolves to the matching encoding.
#[test]
fn test_all_names() {
    // (name, expected encoding) pairs, checked in the listed order.
    let expectations: &[(&[u8], &'static Encoding)] = &[
        (b"GBK", GBK),
        (b"Big5", BIG5),
        (b"UTF-8", UTF_8),
        (b"IBM866", IBM866),
        (b"EUC-JP", EUC_JP),
        (b"KOI8-R", KOI8_R),
        (b"EUC-KR", EUC_KR),
        (b"KOI8-U", KOI8_U),
        (b"gb18030", GB18030),
        (b"UTF-16BE", UTF_16BE),
        (b"UTF-16LE", UTF_16LE),
        (b"Shift_JIS", SHIFT_JIS),
        (b"macintosh", MACINTOSH),
        (b"ISO-8859-2", ISO_8859_2),
        (b"ISO-8859-3", ISO_8859_3),
        (b"ISO-8859-4", ISO_8859_4),
        (b"ISO-8859-5", ISO_8859_5),
        (b"ISO-8859-6", ISO_8859_6),
        (b"ISO-8859-7", ISO_8859_7),
        (b"ISO-8859-8", ISO_8859_8),
        (b"ISO-8859-10", ISO_8859_10),
        (b"ISO-8859-13", ISO_8859_13),
        (b"ISO-8859-14", ISO_8859_14),
        (b"windows-874", WINDOWS_874),
        (b"ISO-8859-15", ISO_8859_15),
        (b"ISO-8859-16", ISO_8859_16),
        (b"ISO-2022-JP", ISO_2022_JP),
        (b"replacement", REPLACEMENT),
        (b"windows-1250", WINDOWS_1250),
        (b"windows-1251", WINDOWS_1251),
        (b"windows-1252", WINDOWS_1252),
        (b"windows-1253", WINDOWS_1253),
        (b"windows-1254", WINDOWS_1254),
        (b"windows-1255", WINDOWS_1255),
        (b"windows-1256", WINDOWS_1256),
        (b"windows-1257", WINDOWS_1257),
        (b"windows-1258", WINDOWS_1258),
        (b"ISO-8859-8-I", ISO_8859_8_I),
        (b"x-mac-cyrillic", X_MAC_CYRILLIC),
        (b"x-user-defined", X_USER_DEFINED),
    ];
    for &(name, encoding) in expectations {
        assert_eq!(Encoding::for_name(name), encoding);
    }
}
|
|
@ -0,0 +1,159 @@
|
|||
// Copyright 2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
|
||||
pub fn decode(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
|
||||
let mut vec = Vec::with_capacity(bytes.len() + 32);
|
||||
let mut string = String::with_capacity(expect.len() + 32);
|
||||
for i in 0usize..32usize {
|
||||
vec.clear();
|
||||
string.clear();
|
||||
for j in 0usize..i {
|
||||
let c = 0x40u8 + (j as u8);
|
||||
vec.push(c);
|
||||
string.push(c as char);
|
||||
}
|
||||
vec.extend_from_slice(bytes);
|
||||
string.push_str(expect);
|
||||
decode_without_padding(encoding, &vec[..], &string[..]);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_without_padding(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
|
||||
decode_to_utf8(encoding, bytes, expect);
|
||||
decode_to_utf16(encoding, bytes, &utf16_from_utf8(expect)[..]);
|
||||
decode_to_string(encoding, bytes, expect);
|
||||
}
|
||||
|
||||
pub fn encode(encoding: &'static Encoding, str: &str, expect: &[u8]) {
|
||||
let mut vec = Vec::with_capacity(expect.len() + 32);
|
||||
let mut string = String::with_capacity(str.len() + 32);
|
||||
for i in 0usize..32usize {
|
||||
vec.clear();
|
||||
string.clear();
|
||||
for j in 0usize..i {
|
||||
let c = 0x40u8 + (j as u8);
|
||||
vec.push(c);
|
||||
string.push(c as char);
|
||||
}
|
||||
vec.extend_from_slice(expect);
|
||||
string.push_str(str);
|
||||
encode_without_padding(encoding, &string[..], &vec[..]);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_without_padding(encoding: &'static Encoding, string: &str, expect: &[u8]) {
|
||||
encode_from_utf8(encoding, string, expect);
|
||||
encode_from_utf16(encoding, &utf16_from_utf8(string)[..], expect);
|
||||
encode_to_vec(encoding, string, expect);
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16(encoding: &'static Encoding, bytes: &[u8], expect: &[u16]) {
|
||||
let mut decoder = encoding.new_decoder();
|
||||
let mut dest: Vec<u16> =
|
||||
Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
|
||||
let capacity = dest.capacity();
|
||||
dest.resize(capacity, 0u16);
|
||||
let (complete, read, written, _) = decoder.decode_to_utf16(bytes, &mut dest, true);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
assert_eq!(read, bytes.len());
|
||||
assert_eq!(written, expect.len());
|
||||
dest.truncate(written);
|
||||
assert_eq!(&dest[..], expect);
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
|
||||
let mut decoder = encoding.new_decoder();
|
||||
let mut dest: Vec<u8> =
|
||||
Vec::with_capacity(decoder.max_utf8_buffer_length(bytes.len()).unwrap());
|
||||
let capacity = dest.capacity();
|
||||
dest.resize(capacity, 0u8);
|
||||
let (complete, read, written, _) = decoder.decode_to_utf8(bytes, &mut dest, true);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
assert_eq!(read, bytes.len());
|
||||
assert_eq!(written, expect.len());
|
||||
dest.truncate(written);
|
||||
assert_eq!(&dest[..], expect.as_bytes());
|
||||
}
|
||||
|
||||
pub fn decode_to_string(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
|
||||
let (cow, _, _) = encoding.decode(bytes);
|
||||
assert_eq!(&cow[..], expect);
|
||||
}
|
||||
|
||||
pub fn encode_from_utf8(encoding: &'static Encoding, string: &str, expect: &[u8]) {
|
||||
let mut encoder = encoding.new_encoder();
|
||||
let mut dest: Vec<u8> = Vec::with_capacity(10 * (string.len() + 1)); // 10 is replacement worst case
|
||||
let capacity = dest.capacity();
|
||||
dest.resize(capacity, 0u8);
|
||||
let (complete, read, written, _) = encoder.encode_from_utf8(string, &mut dest, true);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
assert_eq!(read, string.len());
|
||||
assert_eq!(written, expect.len());
|
||||
dest.truncate(written);
|
||||
assert_eq!(&dest[..], expect);
|
||||
}
|
||||
|
||||
pub fn encode_from_utf16(encoding: &'static Encoding, string: &[u16], expect: &[u8]) {
|
||||
let mut encoder = encoding.new_encoder();
|
||||
let mut dest: Vec<u8> = Vec::with_capacity(10 * (string.len() + 1)); // 10 is replacement worst case
|
||||
let capacity = dest.capacity();
|
||||
dest.resize(capacity, 0u8);
|
||||
let (complete, read, written, _) = encoder.encode_from_utf16(string, &mut dest, true);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
assert_eq!(read, string.len());
|
||||
// assert_eq!(written, expect.len());
|
||||
dest.truncate(written);
|
||||
assert_eq!(&dest[..], expect);
|
||||
}
|
||||
|
||||
pub fn encode_to_vec(encoding: &'static Encoding, string: &str, expect: &[u8]) {
|
||||
let (cow, _, _) = encoding.encode(string);
|
||||
assert_eq!(&cow[..], expect);
|
||||
}
|
||||
|
||||
pub fn utf16_from_utf8(string: &str) -> Vec<u16> {
|
||||
let mut decoder = UTF_8.new_decoder_without_bom_handling();
|
||||
let mut vec = Vec::with_capacity(decoder.max_utf16_buffer_length(string.len()).unwrap());
|
||||
let capacity = vec.capacity();
|
||||
vec.resize(capacity, 0);
|
||||
|
||||
let (result, read, written) =
|
||||
decoder.decode_to_utf16_without_replacement(string.as_bytes(), &mut vec[..], true);
|
||||
match result {
|
||||
DecoderResult::InputEmpty => {
|
||||
debug_assert_eq!(read, string.len());
|
||||
vec.resize(written, 0);
|
||||
vec
|
||||
}
|
||||
DecoderResult::Malformed(_, _) => unreachable!("Malformed"),
|
||||
DecoderResult::OutputFull => unreachable!("Output full"),
|
||||
}
|
||||
}
|
|
@ -0,0 +1,378 @@
|
|||
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use handles::*;
|
||||
use variant::*;
|
||||
use super::*;
|
||||
|
||||
/// State carried between decode calls for UTF-16 input.
pub struct Utf16Decoder {
    /// If non-zero and `pending_bmp` is `false`, a pending lead surrogate.
    /// If `pending_bmp` is `true`, this holds a pending BMP code unit instead.
    lead_surrogate: u16,
    /// First byte of a code unit whose second byte has not been seen yet.
    lead_byte: Option<u8>,
    /// `true` if a byte pair is combined with the first byte as the high
    /// byte (big-endian), `false` if the second byte is the high byte.
    be: bool,
    /// If `true`, `lead_surrogate` is actually a pending BMP code unit.
    pending_bmp: bool,
}
|
||||
|
||||
impl Utf16Decoder {
|
||||
pub fn new(big_endian: bool) -> VariantDecoder {
|
||||
VariantDecoder::Utf16(
|
||||
Utf16Decoder {
|
||||
lead_surrogate: 0,
|
||||
lead_byte: None,
|
||||
be: big_endian,
|
||||
pending_bmp: false,
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
pub fn additional_from_state(&self) -> usize() {
|
||||
1 + if self.lead_byte.is_some() { 1 } else { 0 } +
|
||||
if self.lead_surrogate == 0 { 0 } else { 2 }
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(
|
||||
1,
|
||||
checked_div(byte_length.checked_add(self.additional_from_state()), 2),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(
|
||||
1,
|
||||
checked_mul(
|
||||
3,
|
||||
checked_div(byte_length.checked_add(self.additional_from_state()), 2),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(
|
||||
1,
|
||||
checked_mul(
|
||||
3,
|
||||
checked_div(byte_length.checked_add(self.additional_from_state()), 2),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
// Decode implementation, supplied as code blocks to the
// `decoder_functions!` macro (defined outside this file section).
// NOTE(review): the roles given to the four blocks below are inferred from
// their contents; confirm against the macro definition.
decoder_functions!(
    {
        // First block: if a BMP code unit was left pending by an earlier
        // error return, write it out before anything else.
        if self.pending_bmp {
            match dest.check_space_bmp() {
                Space::Full(_) => {
                    return (DecoderResult::OutputFull, 0, 0);
                }
                Space::Available(destination_handle) => {
                    destination_handle.write_bmp(self.lead_surrogate);
                    self.pending_bmp = false;
                    self.lead_surrogate = 0;
                }
            }
        }
    },
    {},
    {
        // Third block: report any leftover lead surrogate and/or lead byte
        // as malformed, clearing that state.
        debug_assert!(!self.pending_bmp);
        let had_lead_byte = self.lead_byte.take().is_some();
        if self.lead_surrogate != 0 {
            self.lead_surrogate = 0;
            // Two bytes for the surrogate, plus one for a dangling byte.
            let malformed_len = if had_lead_byte { 3 } else { 2 };
            return (
                DecoderResult::Malformed(malformed_len, 0),
                src_consumed,
                dest.written(),
            );
        }
        if had_lead_byte {
            return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
        }
    },
    {
        // Fourth block: handle one input byte `b`. The first byte of a
        // pair is stashed; the second completes a 16-bit code unit.
        let lead = match self.lead_byte.take() {
            Some(lead) => lead,
            None => {
                self.lead_byte = Some(b);
                continue;
            }
        };
        let code_unit = if self.be {
            ((lead as u16) << 8) | (b as u16)
        } else {
            ((b as u16) << 8) | (lead as u16)
        };
        match code_unit & 0xFC00u16 {
            0xD800u16 => {
                // high surrogate
                if self.lead_surrogate != 0 {
                    // The previous high surrogate was in error and this
                    // one becomes the new pending one.
                    self.lead_surrogate = code_unit;
                    return (
                        DecoderResult::Malformed(2, 2),
                        unread_handle.consumed(),
                        destination_handle.written(),
                    );
                }
                self.lead_surrogate = code_unit;
                continue;
            }
            0xDC00u16 => {
                // low surrogate
                if self.lead_surrogate == 0 {
                    return (
                        DecoderResult::Malformed(2, 0),
                        unread_handle.consumed(),
                        destination_handle.written(),
                    );
                }
                destination_handle.write_surrogate_pair(self.lead_surrogate, code_unit);
                self.lead_surrogate = 0;
                continue;
            }
            _ => {
                // bmp
                if self.lead_surrogate != 0 {
                    // The previous high surrogate was in error and this
                    // code unit becomes a pending BMP character.
                    self.lead_surrogate = code_unit;
                    self.pending_bmp = true;
                    return (
                        DecoderResult::Malformed(2, 2),
                        unread_handle.consumed(),
                        destination_handle.written(),
                    );
                }
                destination_handle.write_bmp(code_unit);
                continue;
            }
        }
    },
    self,
    src_consumed,
    dest,
    source,
    b,
    destination_handle,
    unread_handle,
    check_space_astral
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::super::testing::*;
|
||||
use super::super::*;
|
||||
|
||||
fn decode_utf_16le(bytes: &[u8], expect: &str) {
|
||||
decode_without_padding(UTF_16LE, bytes, expect);
|
||||
}
|
||||
|
||||
fn decode_utf_16be(bytes: &[u8], expect: &str) {
|
||||
decode_without_padding(UTF_16BE, bytes, expect);
|
||||
}
|
||||
|
||||
fn encode_utf_16le(string: &str, expect: &[u8]) {
|
||||
encode(UTF_16LE, string, expect);
|
||||
}
|
||||
|
||||
fn encode_utf_16be(string: &str, expect: &[u8]) {
|
||||
encode(UTF_16BE, string, expect);
|
||||
}
|
||||
|
||||
/// Decodes matching little-endian and big-endian inputs and checks that
/// each pair yields the same expected text.
#[test]
fn test_utf_16_decode() {
    // (UTF-16LE input, UTF-16BE input, expected text)
    let cases: &[(&[u8], &[u8], &str)] = &[
        // Empty input.
        (b"", b"", ""),
        // Two BMP code units.
        (b"\x61\x00\x62\x00", b"\x00\x61\x00\x62", "\u{0061}\u{0062}"),
        // Input begins with the other byte order's BOM, followed by
        // content in that other byte order.
        (
            b"\xFE\xFF\x00\x61\x00\x62",
            b"\xFF\xFE\x61\x00\x62\x00",
            "\u{0061}\u{0062}",
        ),
        // Trailing single byte.
        (b"\x61\x00\x62", b"\x00\x61\x00", "\u{0061}\u{FFFD}"),
        // High surrogate followed by a trailing single byte.
        (b"\x3D\xD8\xA9", b"\xD8\x3D\xDC", "\u{FFFD}"),
        // Surrogate pair followed by a BMP code unit.
        (
            b"\x3D\xD8\xA9\xDC\x03\x26",
            b"\xD8\x3D\xDC\xA9\x26\x03",
            "\u{1F4A9}\u{2603}",
        ),
        // Lone low surrogate followed by a BMP code unit.
        (b"\xA9\xDC\x03\x26", b"\xDC\xA9\x26\x03", "\u{FFFD}\u{2603}"),
        // Lone high surrogate followed by a BMP code unit.
        (b"\x3D\xD8\x03\x26", b"\xD8\x3D\x26\x03", "\u{FFFD}\u{2603}"),
    ];
    for &(le_bytes, be_bytes, expect) in cases {
        decode_utf_16le(le_bytes, expect);
        decode_utf_16be(be_bytes, expect);
    }
}
|
||||
|
||||
/// Checks that encoders obtained from the UTF-16 encodings report UTF-8 as
/// their encoding and produce UTF-8 output.
#[test]
fn test_utf_16_encode() {
    // Empty
    encode_utf_16be("", b"");
    encode_utf_16le("", b"");

    // Encodes as UTF-8
    let le_encoder = UTF_16LE.new_encoder();
    assert_eq!(le_encoder.encoding(), UTF_8);
    let be_encoder = UTF_16BE.new_encoder();
    assert_eq!(be_encoder.encoding(), UTF_8);

    let text = "\u{1F4A9}\u{2603}";
    encode_utf_16le(text, text.as_bytes());
    encode_utf_16be(text, text.as_bytes());
}
|
||||
|
||||
/// Feeds UTF-16BE input to a streaming decoder one byte at a time, sizing
/// the output slice from `max_utf16_buffer_length` on every call.
#[test]
fn test_utf_16be_decode_one_by_one() {
    let mut decoder = UTF_16BE.new_decoder();
    let mut scratch = [0u16; 20];
    let input = b"\x00\x61\x00\xE4\x26\x03\xD8\x3D\xDC\xA9";
    for chunk in input.chunks(1) {
        assert_eq!(chunk.len(), 1);
        let needed = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
        // `last` is false: more input may follow each chunk.
        let (result, read, _, had_errors) =
            decoder.decode_to_utf16(chunk, &mut scratch[..needed], false);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert!(!had_errors);
    }
}
|
||||
|
||||
#[test]
fn test_utf_16le_decode_one_by_one() {
    // U+0061, U+00E4, U+2603 and the surrogate pair for U+1F4A9, little-endian.
    let input = b"\x61\x00\xE4\x00\x03\x26\x3D\xD8\xA9\xDC";
    let mut output = [0u16; 20];
    let mut decoder = UTF_16LE.new_decoder();

    // Feed the decoder a single byte per call; every call must consume its
    // byte without reporting an error.
    for chunk in input.chunks(1) {
        assert_eq!(chunk.len(), 1);
        let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
        let outcome = decoder.decode_to_utf16(chunk, &mut output[..capacity], false);
        assert_eq!(outcome.0, CoderResult::InputEmpty);
        assert_eq!(outcome.1, 1);
        assert!(!outcome.3);
    }
}
|
||||
|
||||
#[test]
fn test_utf_16be_decode_three_at_a_time() {
    // Twelve bytes of big-endian UTF-16. With three-byte chunks, some chunk
    // boundaries fall in the middle of a two-byte code unit.
    let input = b"\x00\xE4\x26\x03\xD8\x3D\xDC\xA9\x00\x61\x00\xE4";
    let mut output = [0u16; 20];
    let mut decoder = UTF_16BE.new_decoder();

    for chunk in input.chunks(3) {
        assert_eq!(chunk.len(), 3);
        let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
        let outcome = decoder.decode_to_utf16(chunk, &mut output[..capacity], false);
        // Every chunk must be consumed completely and without errors.
        assert_eq!(outcome.0, CoderResult::InputEmpty);
        assert_eq!(outcome.1, chunk.len());
        assert!(!outcome.3);
    }
}
|
||||
|
||||
#[test]
fn test_utf_16le_decode_three_at_a_time() {
    // Twelve bytes of little-endian UTF-16. With three-byte chunks, some chunk
    // boundaries fall in the middle of a two-byte code unit.
    let input = b"\xE4\x00\x03\x26\x3D\xD8\xA9\xDC\x61\x00\xE4\x00";
    let mut output = [0u16; 20];
    let mut decoder = UTF_16LE.new_decoder();

    for chunk in input.chunks(3) {
        assert_eq!(chunk.len(), 3);
        let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
        let outcome = decoder.decode_to_utf16(chunk, &mut output[..capacity], false);
        // Every chunk must be consumed completely and without errors.
        assert_eq!(outcome.0, CoderResult::InputEmpty);
        assert_eq!(outcome.1, chunk.len());
        assert!(!outcome.3);
    }
}
|
||||
|
||||
#[test]
fn test_utf_16le_decode_bom_prefixed_split_byte_pair() {
    let mut output = [0u16; 20];
    let mut decoder = UTF_16LE.new_decoder();

    // First call: a lone 0xFF (the first byte of the little-endian BOM) with
    // more input to come. It is consumed and nothing is written yet.
    let first_capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let first = decoder.decode_to_utf16(b"\xFF", &mut output[..first_capacity], false);
    assert_eq!(first, (CoderResult::InputEmpty, 1, 0, false));

    // Second, final call: 0xFD does not complete a BOM, so the two bytes come
    // out as the single code unit 0xFDFF, without an error.
    let second_capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let second = decoder.decode_to_utf16(b"\xFD", &mut output[..second_capacity], true);
    assert_eq!(second, (CoderResult::InputEmpty, 1, 1, false));
    assert_eq!(output[0], 0xFDFF);
}
|
||||
|
||||
#[test]
fn test_utf_16be_decode_bom_prefixed_split_byte_pair() {
    let mut output = [0u16; 20];
    let mut decoder = UTF_16BE.new_decoder();

    // First call: a lone 0xFE (the first byte of the big-endian BOM) with
    // more input to come. It is consumed and nothing is written yet.
    let first_capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let first = decoder.decode_to_utf16(b"\xFE", &mut output[..first_capacity], false);
    assert_eq!(first, (CoderResult::InputEmpty, 1, 0, false));

    // Second, final call: 0xFD does not complete a BOM, so the two bytes come
    // out as the single code unit 0xFEFD, without an error.
    let second_capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let second = decoder.decode_to_utf16(b"\xFD", &mut output[..second_capacity], true);
    assert_eq!(second, (CoderResult::InputEmpty, 1, 1, false));
    assert_eq!(output[0], 0xFEFD);
}
|
||||
|
||||
#[test]
fn test_utf_16le_decode_bom_prefix() {
    let mut output = [0u16; 20];
    let mut decoder = UTF_16LE.new_decoder();

    // The whole input is a single 0xFF, i.e. only the first byte of the
    // little-endian BOM. As the last chunk it is reported as an error and
    // replaced with U+FFFD.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let outcome = decoder.decode_to_utf16(b"\xFF", &mut output[..capacity], true);
    assert_eq!(outcome, (CoderResult::InputEmpty, 1, 1, true));
    assert_eq!(output[0], 0xFFFD);
}
|
||||
|
||||
#[test]
fn test_utf_16be_decode_bom_prefix() {
    let mut output = [0u16; 20];
    let mut decoder = UTF_16BE.new_decoder();

    // The whole input is a single 0xFE, i.e. only the first byte of the
    // big-endian BOM. As the last chunk it is reported as an error and
    // replaced with U+FFFD.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let outcome = decoder.decode_to_utf16(b"\xFE", &mut output[..capacity], true);
    assert_eq!(outcome, (CoderResult::InputEmpty, 1, 1, true));
    assert_eq!(output[0], 0xFFFD);
}
|
||||
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,180 @@
|
|||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// The initial revision of this file was extracted from the "UTF-8 validation"
|
||||
// section of the file src/libcore/str/mod.rs from Rust project at revision
|
||||
// 7ad7232422f7e5bbfa0e52dabe36c12677df19e2. The Utf8Error struct also comes
|
||||
// from that file.
|
||||
|
||||
use ascii::validate_ascii;
|
||||
|
||||
/// Error describing a byte sequence that is not well-formed UTF-8.
///
/// The only information carried is the length of the leading part of the
/// input that was verified to be valid; see [`valid_up_to`].
///
/// [`valid_up_to`]: #method.valid_up_to
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Utf8Error {
    valid_up_to: usize,
}

impl Utf8Error {
    /// Returns the length, in bytes, of the prefix of the input that was
    /// verified to be valid UTF-8.
    ///
    /// Put differently, this is the largest `index` for which
    /// `from_utf8(&input[..index])` would return `Ok(_)`.
    ///
    /// # Examples
    ///
    /// The same contract, demonstrated with the standard library's error type:
    ///
    /// ```
    /// use std::str;
    ///
    /// // some invalid bytes, in a vector
    /// let sparkle_heart = vec![0, 159, 146, 150];
    ///
    /// // std::str::from_utf8 returns a Utf8Error
    /// let error = str::from_utf8(&sparkle_heart).unwrap_err();
    ///
    /// // the second byte is invalid here
    /// assert_eq!(1, error.valid_up_to());
    /// ```
    pub fn valid_up_to(&self) -> usize {
        let Utf8Error { valid_up_to } = *self;
        valid_up_to
    }
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(eval_order_dependence))]
|
||||
#[inline(always)]
|
||||
pub fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
|
||||
let mut offset = 0;
|
||||
let len = v.len();
|
||||
'outer: loop {
|
||||
let mut first = {
|
||||
let remaining = &v[offset..];
|
||||
match validate_ascii(remaining) {
|
||||
None => {
|
||||
// offset += remaining.len();
|
||||
break 'outer;
|
||||
}
|
||||
Some((non_ascii, consumed)) => {
|
||||
offset += consumed;
|
||||
non_ascii
|
||||
}
|
||||
}
|
||||
};
|
||||
let old_offset = offset;
|
||||
macro_rules! err { () => {{
|
||||
return Err(Utf8Error {
|
||||
valid_up_to: old_offset
|
||||
})
|
||||
}}}
|
||||
|
||||
macro_rules! next { () => {{
|
||||
offset += 1;
|
||||
// we needed data, but there was none: error!
|
||||
if offset >= len {
|
||||
err!()
|
||||
}
|
||||
v[offset]
|
||||
}}}
|
||||
'inner: loop {
|
||||
// Intuitively, it would make sense to check availability for
|
||||
// a four-byte sequence here, not check per byte and handle the
|
||||
// end of the buffer as a special case. For some reason, that
|
||||
// disturbs something in a way that would make things slower.
|
||||
let second = next!();
|
||||
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
|
||||
// first C2 80 last DF BF
|
||||
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
|
||||
// first E0 A0 80 last EF BF BF
|
||||
// excluding surrogates codepoints \u{d800} to \u{dfff}
|
||||
// ED A0 80 to ED BF BF
|
||||
// 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
|
||||
// first F0 90 80 80 last F4 8F BF BF
|
||||
//
|
||||
// Use the UTF-8 syntax from the RFC
|
||||
//
|
||||
// https://tools.ietf.org/html/rfc3629
|
||||
// UTF8-1 = %x00-7F
|
||||
// UTF8-2 = %xC2-DF UTF8-tail
|
||||
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
|
||||
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
|
||||
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
|
||||
// %xF4 %x80-8F 2( UTF8-tail )
|
||||
match first {
|
||||
0xC2...0xDF => {
|
||||
if second & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!()
|
||||
}
|
||||
}
|
||||
0xE0 => {
|
||||
match (second, next!() & !CONT_MASK) {
|
||||
(0xA0...0xBF, TAG_CONT_U8) => {}
|
||||
_ => err!(),
|
||||
}
|
||||
}
|
||||
0xE1...0xEC | 0xEE...0xEF => {
|
||||
match (second & !CONT_MASK, next!() & !CONT_MASK) {
|
||||
(TAG_CONT_U8, TAG_CONT_U8) => {}
|
||||
_ => err!(),
|
||||
}
|
||||
}
|
||||
0xED => {
|
||||
match (second, next!() & !CONT_MASK) {
|
||||
(0x80...0x9F, TAG_CONT_U8) => {}
|
||||
_ => err!(),
|
||||
}
|
||||
}
|
||||
0xF0 => {
|
||||
match (second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
|
||||
(0x90...0xBF, TAG_CONT_U8, TAG_CONT_U8) => {}
|
||||
_ => err!(),
|
||||
}
|
||||
}
|
||||
0xF1...0xF3 => {
|
||||
match (second & !CONT_MASK, next!() & !CONT_MASK, next!() & !CONT_MASK) {
|
||||
(TAG_CONT_U8, TAG_CONT_U8, TAG_CONT_U8) => {}
|
||||
_ => err!(),
|
||||
}
|
||||
}
|
||||
0xF4 => {
|
||||
match (second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
|
||||
(0x80...0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
|
||||
_ => err!(),
|
||||
}
|
||||
}
|
||||
_ => err!(),
|
||||
}
|
||||
offset += 1;
|
||||
if offset == len {
|
||||
break 'outer;
|
||||
}
|
||||
first = v[offset];
|
||||
// This check is separate from the above `match`, because merging
|
||||
// this check into it causes a spectacular performance drop
|
||||
// (over twice as slow).
|
||||
if first < 0x80 {
|
||||
offset += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
continue 'inner;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Selects the six payload bits (`0b0011_1111`) of a continuation byte.
const CONT_MASK: u8 = 0x3F;
/// The two tag bits (`0b1000_0000`) that a continuation byte carries once it
/// has been masked with `!CONT_MASK`.
const TAG_CONT_U8: u8 = 0x80;
|
|
@ -0,0 +1,337 @@
|
|||
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
|
||||
// Instead, please regenerate using generate-encoding-data.py
|
||||
|
||||
//! This module provides enums that wrap the various decoders and encoders.
|
||||
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
|
||||
//! dispatch explicitly for a finite set of specialized decoders and encoders.
|
||||
//! Unfortunately, this means the compiler doesn't generate the dispatch code
|
||||
//! and it has to be written here instead.
|
||||
//!
|
||||
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
|
||||
//! allocation in Rust code, including the convenience methods on `Encoding`.
|
||||
|
||||
use single_byte::*;
|
||||
use utf_8::*;
|
||||
use gb18030::*;
|
||||
use big5::*;
|
||||
use euc_jp::*;
|
||||
use iso_2022_jp::*;
|
||||
use shift_jis::*;
|
||||
use euc_kr::*;
|
||||
use replacement::*;
|
||||
use x_user_defined::*;
|
||||
use utf_16::*;
|
||||
use super::*;
|
||||
|
||||
pub enum VariantDecoder {
|
||||
SingleByte(SingleByteDecoder),
|
||||
Utf8(Utf8Decoder),
|
||||
Gb18030(Gb18030Decoder),
|
||||
Big5(Big5Decoder),
|
||||
EucJp(EucJpDecoder),
|
||||
Iso2022Jp(Iso2022JpDecoder),
|
||||
ShiftJis(ShiftJisDecoder),
|
||||
EucKr(EucKrDecoder),
|
||||
Replacement(ReplacementDecoder),
|
||||
UserDefined(UserDefinedDecoder),
|
||||
Utf16(Utf16Decoder),
|
||||
}
|
||||
|
||||
impl VariantDecoder {
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Utf8(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Gb18030(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Big5(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::EucJp(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Iso2022Jp(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::ShiftJis(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::EucKr(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Replacement(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::UserDefined(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Utf16(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Utf8(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Gb18030(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Big5(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::EucJp(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Iso2022Jp(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::ShiftJis(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::EucKr(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Replacement(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::UserDefined(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Utf16(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Utf8(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Gb18030(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Big5(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::EucJp(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Iso2022Jp(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::ShiftJis(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::EucKr(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Replacement(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::UserDefined(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Utf16(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_raw(&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
last: bool)
|
||||
-> (DecoderResult, usize, usize) {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Utf8(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Gb18030(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Big5(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::EucJp(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Iso2022Jp(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::ShiftJis(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::EucKr(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Replacement(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::UserDefined(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Utf16(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8_raw(&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u8],
|
||||
last: bool)
|
||||
-> (DecoderResult, usize, usize) {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Utf8(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Gb18030(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Big5(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::EucJp(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Iso2022Jp(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::ShiftJis(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::EucKr(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Replacement(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::UserDefined(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Utf16(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum VariantEncoder {
|
||||
SingleByte(SingleByteEncoder),
|
||||
Utf8(Utf8Encoder),
|
||||
Gb18030(Gb18030Encoder),
|
||||
Big5(Big5Encoder),
|
||||
EucJp(EucJpEncoder),
|
||||
Iso2022Jp(Iso2022JpEncoder),
|
||||
ShiftJis(ShiftJisEncoder),
|
||||
EucKr(EucKrEncoder),
|
||||
UserDefined(UserDefinedEncoder),
|
||||
}
|
||||
|
||||
impl VariantEncoder {
|
||||
pub fn has_pending_state(&self) -> bool {
|
||||
match *self {
|
||||
VariantEncoder::Iso2022Jp(ref v) => v.has_pending_state(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(&self,
|
||||
u16_length: usize)
|
||||
-> Option<usize> {
|
||||
match *self {
|
||||
VariantEncoder::SingleByte(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::Utf8(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::Gb18030(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::Big5(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::EucJp(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::Iso2022Jp(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::ShiftJis(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::EucKr(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::UserDefined(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(&self,
|
||||
byte_length: usize)
|
||||
-> Option<usize> {
|
||||
match *self {
|
||||
VariantEncoder::SingleByte(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::Utf8(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::Gb18030(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::Big5(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::EucJp(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::Iso2022Jp(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::ShiftJis(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::EucKr(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::UserDefined(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_from_utf16_raw(&mut self,
|
||||
src: &[u16],
|
||||
dst: &mut [u8],
|
||||
last: bool)
|
||||
-> (EncoderResult, usize, usize) {
|
||||
match *self {
|
||||
VariantEncoder::SingleByte(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::Utf8(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::Gb18030(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::Big5(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::EucJp(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::Iso2022Jp(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::ShiftJis(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::EucKr(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::UserDefined(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_from_utf8_raw(&mut self,
|
||||
src: &str,
|
||||
dst: &mut [u8],
|
||||
last: bool)
|
||||
-> (EncoderResult, usize, usize) {
|
||||
match *self {
|
||||
VariantEncoder::SingleByte(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::Utf8(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::Gb18030(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::Big5(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::EucJp(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::Iso2022Jp(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::ShiftJis(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::EucKr(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::UserDefined(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum VariantEncoding {
|
||||
SingleByte(&'static [u16; 128]),
|
||||
Utf8,
|
||||
Gbk,
|
||||
Gb18030,
|
||||
Big5,
|
||||
EucJp,
|
||||
Iso2022Jp,
|
||||
ShiftJis,
|
||||
EucKr,
|
||||
Replacement,
|
||||
Utf16Be,
|
||||
Utf16Le,
|
||||
UserDefined,
|
||||
}
|
||||
|
||||
impl VariantEncoding {
|
||||
pub fn new_variant_decoder(&self) -> VariantDecoder {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(table) => SingleByteDecoder::new(table),
|
||||
VariantEncoding::Utf8 => Utf8Decoder::new(),
|
||||
VariantEncoding::Gbk |
|
||||
VariantEncoding::Gb18030 => Gb18030Decoder::new(),
|
||||
VariantEncoding::Big5 => Big5Decoder::new(),
|
||||
VariantEncoding::EucJp => EucJpDecoder::new(),
|
||||
VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(),
|
||||
VariantEncoding::ShiftJis => ShiftJisDecoder::new(),
|
||||
VariantEncoding::EucKr => EucKrDecoder::new(),
|
||||
VariantEncoding::Replacement => ReplacementDecoder::new(),
|
||||
VariantEncoding::UserDefined => UserDefinedDecoder::new(),
|
||||
VariantEncoding::Utf16Be => Utf16Decoder::new(true),
|
||||
VariantEncoding::Utf16Le => Utf16Decoder::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(table) => SingleByteEncoder::new(encoding, table),
|
||||
VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
|
||||
VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
|
||||
VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
|
||||
VariantEncoding::Big5 => Big5Encoder::new(encoding),
|
||||
VariantEncoding::EucJp => EucJpEncoder::new(encoding),
|
||||
VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding),
|
||||
VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding),
|
||||
VariantEncoding::EucKr => EucKrEncoder::new(encoding),
|
||||
VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding),
|
||||
VariantEncoding::Utf16Be |
|
||||
VariantEncoding::Replacement |
|
||||
VariantEncoding::Utf16Le => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,142 @@
|
|||
// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use handles::*;
|
||||
use variant::*;
|
||||
use super::*;
|
||||
|
||||
pub struct UserDefinedDecoder;
|
||||
|
||||
impl UserDefinedDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::UserDefined(UserDefinedDecoder)
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
Some(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_mul(3)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_mul(3)
|
||||
}
|
||||
|
||||
decoder_functions!(
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
{
|
||||
if b < 0x80 {
|
||||
// XXX optimize ASCII
|
||||
destination_handle.write_ascii(b);
|
||||
continue;
|
||||
}
|
||||
destination_handle.write_upper_bmp((b as usize + 0xF700usize) as u16);
|
||||
continue;
|
||||
},
|
||||
self,
|
||||
src_consumed,
|
||||
dest,
|
||||
source,
|
||||
b,
|
||||
destination_handle,
|
||||
_unread_handle,
|
||||
check_space_bmp
|
||||
);
|
||||
}
|
||||
|
||||
pub struct UserDefinedEncoder;
|
||||
|
||||
impl UserDefinedEncoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::UserDefined(UserDefinedEncoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(&self,
|
||||
u16_length: usize)
|
||||
-> Option<usize> {
|
||||
Some(u16_length)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(&self,
|
||||
byte_length: usize)
|
||||
-> Option<usize> {
|
||||
Some(byte_length)
|
||||
}
|
||||
|
||||
encoder_functions!(
|
||||
{},
|
||||
{
|
||||
if c <= '\u{7F}' {
|
||||
// TODO optimize ASCII run
|
||||
destination_handle.write_one(c as u8);
|
||||
continue;
|
||||
}
|
||||
if c < '\u{F780}' || c > '\u{F7FF}' {
|
||||
return (EncoderResult::Unmappable(c),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written());
|
||||
}
|
||||
destination_handle.write_one((c as usize - 0xF700usize) as u8);
|
||||
continue;
|
||||
},
|
||||
self,
|
||||
src_consumed,
|
||||
source,
|
||||
dest,
|
||||
c,
|
||||
destination_handle,
|
||||
unread_handle,
|
||||
check_space_one
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode` check with the `X_USER_DEFINED` encoding.
    fn decode_x_user_defined(bytes: &[u8], expect: &str) {
        let encoding = X_USER_DEFINED;
        decode(encoding, bytes, expect);
    }

    /// Runs the shared `encode` check with the `X_USER_DEFINED` encoding.
    fn encode_x_user_defined(string: &str, expect: &[u8]) {
        let encoding = X_USER_DEFINED;
        encode(encoding, string, expect);
    }

    #[test]
    fn test_x_user_defined_decode() {
        let cases: [(&[u8], &str); 3] = [
            // Empty
            (b"", ""),
            // ASCII
            (b"\x61\x62", "\u{0061}\u{0062}"),
            // Lowest and highest non-ASCII byte
            (b"\x80\xFF", "\u{F780}\u{F7FF}"),
        ];
        for &(bytes, expect) in cases.iter() {
            decode_x_user_defined(bytes, expect);
        }
    }

    #[test]
    fn test_x_user_defined_encode() {
        let cases: [(&str, &[u8]); 4] = [
            // Empty
            ("", b""),
            // ASCII
            ("\u{0061}\u{0062}", b"\x61\x62"),
            // Both ends of the mapped range
            ("\u{F780}\u{F7FF}", b"\x80\xFF"),
            // Characters just outside the mapped range
            ("\u{F77F}\u{F800}", b""),
        ];
        for &(string, expect) in cases.iter() {
            encode_x_user_defined(string, expect);
        }
    }
}
|
|
@ -278,6 +278,22 @@ dependencies = [
|
|||
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_c"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding_rs 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.6.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"cfg-if 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.4.1"
|
||||
|
@ -355,6 +371,7 @@ name = "gkrust-shared"
|
|||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cubeb-pulse 0.0.1",
|
||||
"encoding_c 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"geckoservo 0.0.1",
|
||||
"mp4parse_capi 0.8.0",
|
||||
"nserror 0.1.0",
|
||||
|
@ -1208,6 +1225,8 @@ dependencies = [
|
|||
"checksum cssparser-macros 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "079adec4af52bb5275eadd004292028c79eb3c5f5b4ee8086a36d4197032f6df"
|
||||
"checksum deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1614659040e711785ed8ea24219140654da1729f3ec8a47a9719d041112fe7bf"
|
||||
"checksum dwrote 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "74114b6b49d6731835da7a28a3642651451e315f7f9b9d04e907e65a45681796"
|
||||
"checksum encoding_c 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)" = "45ef700aebe8c5fb44f081a54ab400f4f6b002a426bc5332381c108f49713432"
|
||||
"checksum encoding_rs 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)" = "e00a1b1e95eb46988805ceee6f34cd95c46a6753e290cb3ff0486931989d4a4c"
|
||||
"checksum env_logger 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ed39959122ea027670b704fb70539f4286ddf4a49eefede23bf0b4b2a069ec03"
|
||||
"checksum euclid 0.13.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6083f113c422ff9cd855a1cf6cc8ec0903606c0eb43a0c6a0ced3bdc9731e4c1"
|
||||
"checksum fnv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6cc484842f1e2884faf56f529f960cc12ad8c71ce96cc7abba0a067c98fee344"
|
||||
|
|
|
@ -276,6 +276,22 @@ dependencies = [
|
|||
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_c"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding_rs 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.6.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"cfg-if 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.4.1"
|
||||
|
@ -353,6 +369,7 @@ name = "gkrust-shared"
|
|||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cubeb-pulse 0.0.1",
|
||||
"encoding_c 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"geckoservo 0.0.1",
|
||||
"mp4parse_capi 0.8.0",
|
||||
"nserror 0.1.0",
|
||||
|
@ -1195,6 +1212,8 @@ dependencies = [
|
|||
"checksum cssparser-macros 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "079adec4af52bb5275eadd004292028c79eb3c5f5b4ee8086a36d4197032f6df"
|
||||
"checksum deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1614659040e711785ed8ea24219140654da1729f3ec8a47a9719d041112fe7bf"
|
||||
"checksum dwrote 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "74114b6b49d6731835da7a28a3642651451e315f7f9b9d04e907e65a45681796"
|
||||
"checksum encoding_c 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)" = "45ef700aebe8c5fb44f081a54ab400f4f6b002a426bc5332381c108f49713432"
|
||||
"checksum encoding_rs 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)" = "e00a1b1e95eb46988805ceee6f34cd95c46a6753e290cb3ff0486931989d4a4c"
|
||||
"checksum env_logger 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ed39959122ea027670b704fb70539f4286ddf4a49eefede23bf0b4b2a069ec03"
|
||||
"checksum euclid 0.13.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6083f113c422ff9cd855a1cf6cc8ec0903606c0eb43a0c6a0ced3bdc9731e4c1"
|
||||
"checksum fnv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6cc484842f1e2884faf56f529f960cc12ad8c71ce96cc7abba0a067c98fee344"
|
||||
|
|
|
@ -13,6 +13,7 @@ nserror = { path = "../../../../xpcom/rust/nserror" }
|
|||
rust_url_capi = { path = "../../../../netwerk/base/rust-url-capi" }
|
||||
webrender_bindings = { path = "../../../../gfx/webrender_bindings", optional = true }
|
||||
cubeb-pulse = { path = "../../../../media/libcubeb/cubeb-pulse-rs", optional = true, features=["pulse-dlopen"] }
|
||||
encoding_c = "0.7.1"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
|
@ -21,6 +22,9 @@ servo = ["geckoservo"]
|
|||
quantum_render = ["webrender_bindings"]
|
||||
cubeb_pulse_rust = ["cubeb-pulse"]
|
||||
gecko_debug = ["geckoservo/gecko_debug"]
|
||||
simd-accel = ["encoding_c/simd-accel"]
|
||||
no-static-ideograph-encoder-tables = ["encoding_c/no-static-ideograph-encoder-tables"]
|
||||
parallel-utf8 = ["encoding_c/parallel-utf8"]
|
||||
|
||||
[lib]
|
||||
path = "lib.rs"
|
||||
|
|
|
@ -13,6 +13,7 @@ extern crate rust_url_capi;
|
|||
extern crate webrender_bindings;
|
||||
#[cfg(feature = "cubeb_pulse_rust")]
|
||||
extern crate cubeb_pulse;
|
||||
extern crate encoding_c;
|
||||
|
||||
use std::boxed::Box;
|
||||
use std::ffi::CStr;
|
||||
|
|
Загрузка…
Ссылка в новой задаче