pluotsorbet/libs/encoding.js

2435 строки
68 KiB
JavaScript

// Copyright 2014 Joshua Bell. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// If we're in node require encoding-indexes and attach it to the global.
if (typeof module !== "undefined" && module.exports) {
this["encoding-indexes"] = require("./encoding-indexes.js")["encoding-indexes"];
}
(function(global) {
'use strict';
//
// Utilities
//
/**
* @param {number} a The number to test.
* @param {number} min The minimum value in the range, inclusive.
* @param {number} max The maximum value in the range, inclusive.
* @return {boolean} True if a >= min and a <= max.
*/
function inRange(a, min, max) {
return min <= a && a <= max;
}
/**
* @param {number} n The numerator.
* @param {number} d The denominator.
* @return {number} The result of the integer division of n by d.
*/
function div(n, d) {
return Math.floor(n / d);
}
//
// Implementation of Encoding specification
// http://dvcs.w3.org/hg/encoding/raw-file/tip/Overview.html
//
//
// 3. Terminology
//
//
// 4. Encodings
//
/** @const */ var EOF_byte = -1;
/** @const */ var EOF_code_point = -1;
/**
* @constructor
* @param {Uint8Array} bytes Array of bytes that provide the stream.
*/
function ByteInputStream(bytes) {
/** @type {number} */
var pos = 0;
/**
* @this {ByteInputStream}
* @return {number} Get the next byte from the stream.
*/
this.get = function() {
return (pos >= bytes.length) ? EOF_byte : Number(bytes[pos]);
};
/** @param {number} n Number (positive or negative) by which to
* offset the byte pointer. */
this.offset = function(n) {
pos += n;
if (pos < 0) {
throw new Error('Seeking past start of the buffer');
}
if (pos > bytes.length) {
throw new Error('Seeking past EOF');
}
};
/**
* @param {Array.<number>} test Array of bytes to compare against.
* @return {boolean} True if the start of the stream matches the test
* bytes.
*/
this.match = function(test) {
if (test.length > pos + bytes.length) {
return false;
}
var i;
for (i = 0; i < test.length; i += 1) {
if (Number(bytes[pos + i]) !== test[i]) {
return false;
}
}
return true;
};
}
/**
* @constructor
* @param {Array.<number>} bytes The array to write bytes into.
*/
function ByteOutputStream(bytes) {
/** @type {number} */
var pos = 0;
/**
* @param {...number} var_args The byte or bytes to emit into the stream.
* @return {number} The last byte emitted.
*/
this.emit = function(var_args) {
/** @type {number} */
var last = EOF_byte;
var i;
for (i = 0; i < arguments.length; ++i) {
last = Number(arguments[i]);
bytes[pos++] = last;
}
return last;
};
}
/**
* @constructor
* @param {string} string The source of code units for the stream.
*/
function CodePointInputStream(string) {
/**
* @param {string} string Input string of UTF-16 code units.
* @return {Array.<number>} Code points.
*/
function stringToCodePoints(string) {
/** @type {Array.<number>} */
var cps = [];
// Based on http://www.w3.org/TR/WebIDL/#idl-DOMString
var i = 0, n = string.length;
while (i < string.length) {
var c = string.charCodeAt(i);
if (!inRange(c, 0xD800, 0xDFFF)) {
cps.push(c);
} else if (inRange(c, 0xDC00, 0xDFFF)) {
cps.push(0xFFFD);
} else { // (inRange(cu, 0xD800, 0xDBFF))
if (i === n - 1) {
cps.push(0xFFFD);
} else {
var d = string.charCodeAt(i + 1);
if (inRange(d, 0xDC00, 0xDFFF)) {
var a = c & 0x3FF;
var b = d & 0x3FF;
i += 1;
cps.push(0x10000 + (a << 10) + b);
} else {
cps.push(0xFFFD);
}
}
}
i += 1;
}
return cps;
}
/** @type {number} */
var pos = 0;
/** @type {Array.<number>} */
var cps = stringToCodePoints(string);
/** @param {number} n The number of bytes (positive or negative)
* to advance the code point pointer by.*/
this.offset = function(n) {
pos += n;
if (pos < 0) {
throw new Error('Seeking past start of the buffer');
}
if (pos > cps.length) {
throw new Error('Seeking past EOF');
}
};
/** @return {number} Get the next code point from the stream. */
this.get = function() {
if (pos >= cps.length) {
return EOF_code_point;
}
return cps[pos];
};
}
/**
* @constructor
*/
function CodePointOutputStream() {
/** @type {string} */
var string = '';
/** @return {string} The accumulated string. */
this.string = function() {
return string;
};
/** @param {number} c The code point to encode into the stream. */
this.emit = function(c) {
if (c <= 0xFFFF) {
string += String.fromCharCode(c);
} else {
c -= 0x10000;
string += String.fromCharCode(0xD800 + ((c >> 10) & 0x3ff));
string += String.fromCharCode(0xDC00 + (c & 0x3ff));
}
};
}
/**
* @constructor
* @param {string} message Description of the error.
*/
function EncodingError(message) {
this.name = 'EncodingError';
this.message = message;
this.code = 0;
}
EncodingError.prototype = Error.prototype;
/**
* @param {boolean} fatal If true, decoding errors raise an exception.
* @param {number=} opt_code_point Override the standard fallback code point.
* @return {number} The code point to insert on a decoding error.
*/
function decoderError(fatal, opt_code_point) {
if (fatal) {
throw new EncodingError('Decoder error');
}
return opt_code_point || 0xFFFD;
}
/**
* @param {number} code_point The code point that could not be encoded.
* @return {number} Always throws, no value is actually returned.
*/
function encoderError(code_point) {
throw new EncodingError('The code point ' + code_point +
' could not be encoded.');
}
/**
* @param {string} label The encoding label.
* @return {?{name:string,labels:Array.<string>}}
*/
function getEncoding(label) {
label = String(label).trim().toLowerCase();
if (Object.prototype.hasOwnProperty.call(label_to_encoding, label)) {
return label_to_encoding[label];
}
return null;
}
/** @type {Array.<{encodings: Array.<{name:string,labels:Array.<string>}>,
* heading: string}>} */
var encodings = [
{
"encodings": [
{
"labels": [
"unicode-1-1-utf-8",
"utf-8",
"utf8"
],
"name": "utf-8"
}
],
"heading": "The Encoding"
},
{
"encodings": [
{
"labels": [
"866",
"cp866",
"csibm866",
"ibm866"
],
"name": "ibm866"
},
{
"labels": [
"csisolatin2",
"iso-8859-2",
"iso-ir-101",
"iso8859-2",
"iso88592",
"iso_8859-2",
"iso_8859-2:1987",
"l2",
"latin2"
],
"name": "iso-8859-2"
},
{
"labels": [
"csisolatin3",
"iso-8859-3",
"iso-ir-109",
"iso8859-3",
"iso88593",
"iso_8859-3",
"iso_8859-3:1988",
"l3",
"latin3"
],
"name": "iso-8859-3"
},
{
"labels": [
"csisolatin4",
"iso-8859-4",
"iso-ir-110",
"iso8859-4",
"iso88594",
"iso_8859-4",
"iso_8859-4:1988",
"l4",
"latin4"
],
"name": "iso-8859-4"
},
{
"labels": [
"csisolatincyrillic",
"cyrillic",
"iso-8859-5",
"iso-ir-144",
"iso8859-5",
"iso88595",
"iso_8859-5",
"iso_8859-5:1988"
],
"name": "iso-8859-5"
},
{
"labels": [
"arabic",
"asmo-708",
"csiso88596e",
"csiso88596i",
"csisolatinarabic",
"ecma-114",
"iso-8859-6",
"iso-8859-6-e",
"iso-8859-6-i",
"iso-ir-127",
"iso8859-6",
"iso88596",
"iso_8859-6",
"iso_8859-6:1987"
],
"name": "iso-8859-6"
},
{
"labels": [
"csisolatingreek",
"ecma-118",
"elot_928",
"greek",
"greek8",
"iso-8859-7",
"iso-ir-126",
"iso8859-7",
"iso88597",
"iso_8859-7",
"iso_8859-7:1987",
"sun_eu_greek"
],
"name": "iso-8859-7"
},
{
"labels": [
"csiso88598e",
"csisolatinhebrew",
"hebrew",
"iso-8859-8",
"iso-8859-8-e",
"iso-ir-138",
"iso8859-8",
"iso88598",
"iso_8859-8",
"iso_8859-8:1988",
"visual"
],
"name": "iso-8859-8"
},
{
"labels": [
"csiso88598i",
"iso-8859-8-i",
"logical"
],
"name": "iso-8859-8-i"
},
{
"labels": [
"csisolatin6",
"iso-8859-10",
"iso-ir-157",
"iso8859-10",
"iso885910",
"l6",
"latin6"
],
"name": "iso-8859-10"
},
{
"labels": [
"iso-8859-13",
"iso8859-13",
"iso885913"
],
"name": "iso-8859-13"
},
{
"labels": [
"iso-8859-14",
"iso8859-14",
"iso885914"
],
"name": "iso-8859-14"
},
{
"labels": [
"csisolatin9",
"iso-8859-15",
"iso8859-15",
"iso885915",
"iso_8859-15",
"l9"
],
"name": "iso-8859-15"
},
{
"labels": [
"iso-8859-16"
],
"name": "iso-8859-16"
},
{
"labels": [
"cskoi8r",
"koi",
"koi8",
"koi8-r",
"koi8_r"
],
"name": "koi8-r"
},
{
"labels": [
"koi8-u"
],
"name": "koi8-u"
},
{
"labels": [
"csmacintosh",
"mac",
"macintosh",
"x-mac-roman"
],
"name": "macintosh"
},
{
"labels": [
"dos-874",
"iso-8859-11",
"iso8859-11",
"iso885911",
"tis-620",
"windows-874"
],
"name": "windows-874"
},
{
"labels": [
"cp1250",
"windows-1250",
"x-cp1250"
],
"name": "windows-1250"
},
{
"labels": [
"cp1251",
"windows-1251",
"x-cp1251"
],
"name": "windows-1251"
},
{
"labels": [
"ansi_x3.4-1968",
"ascii",
"cp1252",
"cp819",
"csisolatin1",
"ibm819",
"iso-8859-1",
"iso-ir-100",
"iso8859-1",
"iso88591",
"iso_8859-1",
"iso_8859-1:1987",
"l1",
"latin1",
"us-ascii",
"windows-1252",
"x-cp1252"
],
"name": "windows-1252"
},
{
"labels": [
"cp1253",
"windows-1253",
"x-cp1253"
],
"name": "windows-1253"
},
{
"labels": [
"cp1254",
"csisolatin5",
"iso-8859-9",
"iso-ir-148",
"iso8859-9",
"iso88599",
"iso_8859-9",
"iso_8859-9:1989",
"l5",
"latin5",
"windows-1254",
"x-cp1254"
],
"name": "windows-1254"
},
{
"labels": [
"cp1255",
"windows-1255",
"x-cp1255"
],
"name": "windows-1255"
},
{
"labels": [
"cp1256",
"windows-1256",
"x-cp1256"
],
"name": "windows-1256"
},
{
"labels": [
"cp1257",
"windows-1257",
"x-cp1257"
],
"name": "windows-1257"
},
{
"labels": [
"cp1258",
"windows-1258",
"x-cp1258"
],
"name": "windows-1258"
},
{
"labels": [
"x-mac-cyrillic",
"x-mac-ukrainian"
],
"name": "x-mac-cyrillic"
}
],
"heading": "Legacy single-byte encodings"
},
{
"encodings": [
{
"labels": [
"chinese",
"csgb2312",
"csiso58gb231280",
"gb18030",
"gb2312",
"gb_2312",
"gb_2312-80",
"gbk",
"iso-ir-58",
"x-gbk"
],
"name": "gb18030"
},
{
"labels": [
"hz-gb-2312"
],
"name": "hz-gb-2312"
}
],
"heading": "Legacy multi-byte Chinese (simplified) encodings"
},
{
"encodings": [
{
"labels": [
"big5",
"big5-hkscs",
"cn-big5",
"csbig5",
"x-x-big5"
],
"name": "big5"
}
],
"heading": "Legacy multi-byte Chinese (traditional) encodings"
},
{
"encodings": [
{
"labels": [
"cseucpkdfmtjapanese",
"euc-jp",
"x-euc-jp"
],
"name": "euc-jp"
},
{
"labels": [
"csiso2022jp",
"iso-2022-jp"
],
"name": "iso-2022-jp"
},
{
"labels": [
"csshiftjis",
"ms_kanji",
"shift-jis",
"shift_jis",
"sjis",
"windows-31j",
"x-sjis"
],
"name": "shift_jis"
}
],
"heading": "Legacy multi-byte Japanese encodings"
},
{
"encodings": [
{
"labels": [
"cseuckr",
"csksc56011987",
"euc-kr",
"iso-ir-149",
"korean",
"ks_c_5601-1987",
"ks_c_5601-1989",
"ksc5601",
"ksc_5601",
"windows-949"
],
"name": "euc-kr"
}
],
"heading": "Legacy multi-byte Korean encodings"
},
{
"encodings": [
{
"labels": [
"csiso2022kr",
"iso-2022-cn",
"iso-2022-cn-ext",
"iso-2022-kr"
],
"name": "replacement"
},
{
"labels": [
"utf-16be"
],
"name": "utf-16be"
},
{
"labels": [
"utf-16",
"utf-16le"
],
"name": "utf-16le"
},
{
"labels": [
"x-user-defined"
],
"name": "x-user-defined"
}
],
"heading": "Legacy miscellaneous encodings"
}
];
var name_to_encoding = {};
var label_to_encoding = {};
encodings.forEach(function(category) {
category['encodings'].forEach(function(encoding) {
name_to_encoding[encoding['name']] = encoding;
encoding['labels'].forEach(function(label) {
label_to_encoding[label] = encoding;
});
});
});
//
// 5. Indexes
//
/**
* @param {number} pointer The |pointer| to search for.
* @param {Array.<?number>|undefined} index The |index| to search within.
* @return {?number} The code point corresponding to |pointer| in |index|,
* or null if |code point| is not in |index|.
*/
function indexCodePointFor(pointer, index) {
if (!index) return null;
return index[pointer] || null;
}
/**
* @param {number} code_point The |code point| to search for.
* @param {Array.<?number>} index The |index| to search within.
* @return {?number} The first pointer corresponding to |code point| in
* |index|, or null if |code point| is not in |index|.
*/
function indexPointerFor(code_point, index) {
var pointer = index.indexOf(code_point);
return pointer === -1 ? null : pointer;
}
/**
* @param {string} name Name of the index.
* @return {(Array.<number>|Array.<Array.<number>>)}
* */
function index(name) {
if (!('encoding-indexes' in global))
throw new Error("Indexes missing. Did you forget to include encoding-indexes.js?");
return global['encoding-indexes'][name];
}
/**
* @param {number} pointer The |pointer| to search for in the gb18030 index.
* @return {?number} The code point corresponding to |pointer| in |index|,
* or null if |code point| is not in the gb18030 index.
*/
function indexGB18030CodePointFor(pointer) {
if ((pointer > 39419 && pointer < 189000) || (pointer > 1237575)) {
return null;
}
var /** @type {number} */ offset = 0,
/** @type {number} */ code_point_offset = 0,
/** @type {Array.<Array.<number>>} */ idx = index('gb18030');
var i;
for (i = 0; i < idx.length; ++i) {
var entry = idx[i];
if (entry[0] <= pointer) {
offset = entry[0];
code_point_offset = entry[1];
} else {
break;
}
}
return code_point_offset + pointer - offset;
}
/**
* @param {number} code_point The |code point| to locate in the gb18030 index.
* @return {number} The first pointer corresponding to |code point| in the
* gb18030 index.
*/
function indexGB18030PointerFor(code_point) {
var /** @type {number} */ offset = 0,
/** @type {number} */ pointer_offset = 0,
/** @type {Array.<Array.<number>>} */ idx = index('gb18030');
var i;
for (i = 0; i < idx.length; ++i) {
var entry = idx[i];
if (entry[1] <= code_point) {
offset = entry[1];
pointer_offset = entry[0];
} else {
break;
}
}
return pointer_offset + code_point - offset;
}
//
// 7. API
//
/** @const */ var DEFAULT_ENCODING = 'utf-8';
// 7.1 Interface TextDecoder
/**
* @constructor
* @param {string=} opt_encoding The label of the encoding;
* defaults to 'utf-8'.
* @param {Object=} options
*/
function TextDecoder(opt_encoding, options) {
if (!(this instanceof TextDecoder)) {
return new TextDecoder(opt_encoding, options);
}
opt_encoding = opt_encoding ? String(opt_encoding) : DEFAULT_ENCODING;
options = Object(options);
/** @private */
this._encoding = getEncoding(opt_encoding);
if (this._encoding === null || this._encoding.name === 'replacement')
throw new TypeError('Unknown encoding: ' + opt_encoding);
if (!this._encoding.getDecoder)
throw new Error('Decoder not present. Did you forget to include encoding-indexes.js?');
/** @private @type {boolean} */
this._streaming = false;
/** @private @type {boolean} */
this._BOMseen = false;
/** @private */
this._decoder = null;
/** @private @type {{fatal: boolean}} */
this._options = { fatal: Boolean(options['fatal']) };
if (Object.defineProperty) {
Object.defineProperty(
this, 'encoding',
{ get: function() { return this._encoding.name; } });
} else {
this.encoding = this._encoding.name;
}
return this;
}
// TODO: Issue if input byte stream is offset by decoder
// TODO: BOM detection will not work if stream header spans multiple calls
// (last N bytes of previous stream may need to be retained?)
TextDecoder.prototype = {
/**
* @param {ArrayBufferView=} opt_view The buffer of bytes to decode.
* @param {Object=} options
*/
decode: function decode(opt_view, options) {
if (opt_view && !('buffer' in opt_view && 'byteOffset' in opt_view &&
'byteLength' in opt_view)) {
throw new TypeError('Expected ArrayBufferView');
} else if (!opt_view) {
opt_view = new Uint8Array(0);
}
options = Object(options);
if (!this._streaming) {
this._decoder = this._encoding.getDecoder(this._options);
this._BOMseen = false;
}
this._streaming = Boolean(options['stream']);
var bytes = new Uint8Array(opt_view.buffer,
opt_view.byteOffset,
opt_view.byteLength);
var input_stream = new ByteInputStream(bytes);
var output_stream = new CodePointOutputStream();
/** @type {number} */
var code_point;
while (input_stream.get() !== EOF_byte) {
code_point = this._decoder.decode(input_stream);
if (code_point !== null && code_point !== EOF_code_point) {
output_stream.emit(code_point);
}
}
if (!this._streaming) {
do {
code_point = this._decoder.decode(input_stream);
if (code_point !== null && code_point !== EOF_code_point) {
output_stream.emit(code_point);
}
} while (code_point !== EOF_code_point &&
input_stream.get() != EOF_byte);
this._decoder = null;
}
var result = output_stream.string();
if (!this._BOMseen && result.length) {
this._BOMseen = true;
if (['utf-8', 'utf-16le', 'utf-16be'].indexOf(this.encoding) !== -1 &&
result.charCodeAt(0) === 0xFEFF) {
result = result.substring(1);
}
}
return result;
}
};
// 7.2 Interface TextEncoder
/**
* @constructor
* @param {string=} opt_encoding The label of the encoding;
* defaults to 'utf-8'.
* @param {{fatal: boolean}=} options
*/
function TextEncoder(opt_encoding, options) {
if (!(this instanceof TextEncoder)) {
return new TextEncoder(opt_encoding, options);
}
opt_encoding = opt_encoding ? String(opt_encoding) : DEFAULT_ENCODING;
options = Object(options);
/** @private */
this._encoding = getEncoding(opt_encoding);
var allowLegacyEncoding = options.NONSTANDARD_allowLegacyEncoding;
var isLegacyEncoding = (this._encoding.name !== 'utf-8' &&
this._encoding.name !== 'utf-16le' &&
this._encoding.name !== 'utf-16be');
if (this._encoding === null || (isLegacyEncoding && !allowLegacyEncoding))
throw new TypeError('Unknown encoding: ' + opt_encoding);
if (!this._encoding.getEncoder)
throw new Error('Encoder not present. Did you forget to include encoding-indexes.js?');
/** @private @type {boolean} */
this._streaming = false;
/** @private */
this._encoder = null;
/** @private @type {{fatal: boolean}} */
this._options = { fatal: Boolean(options['fatal']) };
if (Object.defineProperty) {
Object.defineProperty(
this, 'encoding',
{ get: function() { return this._encoding.name; } });
} else {
this.encoding = this._encoding.name;
}
return this;
}
TextEncoder.prototype = {
/**
* @param {string=} opt_string The string to encode.
* @param {{stream: boolean}=} options
*/
encode: function encode(opt_string, options) {
opt_string = opt_string ? String(opt_string) : '';
options = Object(options);
// TODO: any options?
if (!this._streaming) {
this._encoder = this._encoding.getEncoder(this._options);
}
this._streaming = Boolean(options['stream']);
var bytes = [];
var output_stream = new ByteOutputStream(bytes);
var input_stream = new CodePointInputStream(opt_string);
while (input_stream.get() !== EOF_code_point) {
this._encoder.encode(output_stream, input_stream);
}
if (!this._streaming) {
/** @type {number} */
var last_byte;
do {
last_byte = this._encoder.encode(output_stream, input_stream);
} while (last_byte !== EOF_byte);
this._encoder = null;
}
return new Uint8Array(bytes);
}
};
//
// 8. The encoding
//
// 8.1 utf-8
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function UTF8Decoder(options) {
var fatal = options.fatal;
var /** @type {number} */ utf8_code_point = 0,
/** @type {number} */ utf8_bytes_needed = 0,
/** @type {number} */ utf8_bytes_seen = 0,
/** @type {number} */ utf8_lower_boundary = 0;
/**
* @param {ByteInputStream} byte_pointer The byte stream to decode.
* @return {?number} The next code point decoded, or null if not enough
* data exists in the input stream to decode a complete code point.
*/
this.decode = function(byte_pointer) {
var bite = byte_pointer.get();
if (bite === EOF_byte) {
if (utf8_bytes_needed !== 0) {
return decoderError(fatal);
}
return EOF_code_point;
}
byte_pointer.offset(1);
if (utf8_bytes_needed === 0) {
if (inRange(bite, 0x00, 0x7F)) {
return bite;
}
if (inRange(bite, 0xC2, 0xDF)) {
utf8_bytes_needed = 1;
utf8_lower_boundary = 0x80;
utf8_code_point = bite - 0xC0;
} else if (inRange(bite, 0xE0, 0xEF)) {
utf8_bytes_needed = 2;
utf8_lower_boundary = 0x800;
utf8_code_point = bite - 0xE0;
} else if (inRange(bite, 0xF0, 0xF4)) {
utf8_bytes_needed = 3;
utf8_lower_boundary = 0x10000;
utf8_code_point = bite - 0xF0;
} else {
return decoderError(fatal);
}
utf8_code_point = utf8_code_point * Math.pow(64, utf8_bytes_needed);
return null;
}
if (!inRange(bite, 0x80, 0xBF)) {
utf8_code_point = 0;
utf8_bytes_needed = 0;
utf8_bytes_seen = 0;
utf8_lower_boundary = 0;
byte_pointer.offset(-1);
return decoderError(fatal);
}
utf8_bytes_seen += 1;
utf8_code_point = utf8_code_point + (bite - 0x80) *
Math.pow(64, utf8_bytes_needed - utf8_bytes_seen);
if (utf8_bytes_seen !== utf8_bytes_needed) {
return null;
}
var code_point = utf8_code_point;
var lower_boundary = utf8_lower_boundary;
utf8_code_point = 0;
utf8_bytes_needed = 0;
utf8_bytes_seen = 0;
utf8_lower_boundary = 0;
if (inRange(code_point, lower_boundary, 0x10FFFF) &&
!inRange(code_point, 0xD800, 0xDFFF)) {
return code_point;
}
return decoderError(fatal);
};
}
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function UTF8Encoder(options) {
var fatal = options.fatal;
/**
* @param {ByteOutputStream} output_byte_stream Output byte stream.
* @param {CodePointInputStream} code_point_pointer Input stream.
* @return {number} The last byte emitted.
*/
this.encode = function(output_byte_stream, code_point_pointer) {
/** @type {number} */
var code_point = code_point_pointer.get();
if (code_point === EOF_code_point) {
return EOF_byte;
}
code_point_pointer.offset(1);
if (inRange(code_point, 0xD800, 0xDFFF)) {
return encoderError(code_point);
}
if (inRange(code_point, 0x0000, 0x007f)) {
return output_byte_stream.emit(code_point);
}
var count, offset;
if (inRange(code_point, 0x0080, 0x07FF)) {
count = 1;
offset = 0xC0;
} else if (inRange(code_point, 0x0800, 0xFFFF)) {
count = 2;
offset = 0xE0;
} else if (inRange(code_point, 0x10000, 0x10FFFF)) {
count = 3;
offset = 0xF0;
}
var result = output_byte_stream.emit(
div(code_point, Math.pow(64, count)) + offset);
while (count > 0) {
var temp = div(code_point, Math.pow(64, count - 1));
result = output_byte_stream.emit(0x80 + (temp % 64));
count -= 1;
}
return result;
};
}
/** @param {{fatal: boolean}} options */
name_to_encoding['utf-8'].getEncoder = function(options) {
return new UTF8Encoder(options);
};
/** @param {{fatal: boolean}} options */
name_to_encoding['utf-8'].getDecoder = function(options) {
return new UTF8Decoder(options);
};
//
// 9. Legacy single-byte encodings
//
/**
* @constructor
* @param {Array.<number>} index The encoding index.
* @param {{fatal: boolean}} options
*/
function SingleByteDecoder(index, options) {
var fatal = options.fatal;
/**
* @param {ByteInputStream} byte_pointer The byte stream to decode.
* @return {?number} The next code point decoded, or null if not enough
* data exists in the input stream to decode a complete code point.
*/
this.decode = function(byte_pointer) {
var bite = byte_pointer.get();
if (bite === EOF_byte) {
return EOF_code_point;
}
byte_pointer.offset(1);
if (inRange(bite, 0x00, 0x7F)) {
return bite;
}
var code_point = index[bite - 0x80];
if (code_point === null) {
return decoderError(fatal);
}
return code_point;
};
}
/**
* @constructor
* @param {Array.<?number>} index The encoding index.
* @param {{fatal: boolean}} options
*/
function SingleByteEncoder(index, options) {
var fatal = options.fatal;
/**
* @param {ByteOutputStream} output_byte_stream Output byte stream.
* @param {CodePointInputStream} code_point_pointer Input stream.
* @return {number} The last byte emitted.
*/
this.encode = function(output_byte_stream, code_point_pointer) {
var code_point = code_point_pointer.get();
if (code_point === EOF_code_point) {
return EOF_byte;
}
code_point_pointer.offset(1);
if (inRange(code_point, 0x0000, 0x007F)) {
return output_byte_stream.emit(code_point);
}
var pointer = indexPointerFor(code_point, index);
if (pointer === null) {
encoderError(code_point);
}
return output_byte_stream.emit(pointer + 0x80);
};
}
(function() {
if (!('encoding-indexes' in global))
return;
encodings.forEach(function(category) {
if (category['heading'] !== 'Legacy single-byte encodings')
return;
category['encodings'].forEach(function(encoding) {
var idx = index(encoding['name']);
/** @param {{fatal: boolean}} options */
encoding.getDecoder = function(options) {
return new SingleByteDecoder(idx, options);
};
/** @param {{fatal: boolean}} options */
encoding.getEncoder = function(options) {
return new SingleByteEncoder(idx, options);
};
});
});
}());
//
// 10. Legacy multi-byte Chinese (simplified) encodings
//
// 9.1 gb18030
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function GB18030Decoder(options) {
var fatal = options.fatal;
var /** @type {number} */ gb18030_first = 0x00,
/** @type {number} */ gb18030_second = 0x00,
/** @type {number} */ gb18030_third = 0x00;
/**
* @param {ByteInputStream} byte_pointer The byte stream to decode.
* @return {?number} The next code point decoded, or null if not enough
* data exists in the input stream to decode a complete code point.
*/
this.decode = function(byte_pointer) {
var bite = byte_pointer.get();
if (bite === EOF_byte && gb18030_first === 0x00 &&
gb18030_second === 0x00 && gb18030_third === 0x00) {
return EOF_code_point;
}
if (bite === EOF_byte &&
(gb18030_first !== 0x00 || gb18030_second !== 0x00 || gb18030_third !== 0x00)) {
gb18030_first = 0x00;
gb18030_second = 0x00;
gb18030_third = 0x00;
decoderError(fatal);
}
byte_pointer.offset(1);
var code_point;
if (gb18030_third !== 0x00) {
code_point = null;
if (inRange(bite, 0x30, 0x39)) {
code_point = indexGB18030CodePointFor(
(((gb18030_first - 0x81) * 10 + (gb18030_second - 0x30)) * 126 +
(gb18030_third - 0x81)) * 10 + bite - 0x30);
}
gb18030_first = 0x00;
gb18030_second = 0x00;
gb18030_third = 0x00;
if (code_point === null) {
byte_pointer.offset(-3);
return decoderError(fatal);
}
return code_point;
}
if (gb18030_second !== 0x00) {
if (inRange(bite, 0x81, 0xFE)) {
gb18030_third = bite;
return null;
}
byte_pointer.offset(-2);
gb18030_first = 0x00;
gb18030_second = 0x00;
return decoderError(fatal);
}
if (gb18030_first !== 0x00) {
if (inRange(bite, 0x30, 0x39)) {
gb18030_second = bite;
return null;
}
var lead = gb18030_first;
var pointer = null;
gb18030_first = 0x00;
var offset = bite < 0x7F ? 0x40 : 0x41;
if (inRange(bite, 0x40, 0x7E) || inRange(bite, 0x80, 0xFE)) {
pointer = (lead - 0x81) * 190 + (bite - offset);
}
code_point = pointer === null ? null :
indexCodePointFor(pointer, index('gb18030'));
if (pointer === null) {
byte_pointer.offset(-1);
}
if (code_point === null) {
return decoderError(fatal);
}
return code_point;
}
if (inRange(bite, 0x00, 0x7F)) {
return bite;
}
if (bite === 0x80) {
return 0x20AC;
}
if (inRange(bite, 0x81, 0xFE)) {
gb18030_first = bite;
return null;
}
return decoderError(fatal);
};
}
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function GB18030Encoder(options) {
var fatal = options.fatal;
/**
* @param {ByteOutputStream} output_byte_stream Output byte stream.
* @param {CodePointInputStream} code_point_pointer Input stream.
* @return {number} The last byte emitted.
*/
this.encode = function(output_byte_stream, code_point_pointer) {
var code_point = code_point_pointer.get();
if (code_point === EOF_code_point) {
return EOF_byte;
}
code_point_pointer.offset(1);
if (inRange(code_point, 0x0000, 0x007F)) {
return output_byte_stream.emit(code_point);
}
var pointer = indexPointerFor(code_point, index('gb18030'));
if (pointer !== null) {
var lead = div(pointer, 190) + 0x81;
var trail = pointer % 190;
var offset = trail < 0x3F ? 0x40 : 0x41;
return output_byte_stream.emit(lead, trail + offset);
}
pointer = indexGB18030PointerFor(code_point);
var byte1 = div(div(div(pointer, 10), 126), 10);
pointer = pointer - byte1 * 10 * 126 * 10;
var byte2 = div(div(pointer, 10), 126);
pointer = pointer - byte2 * 10 * 126;
var byte3 = div(pointer, 10);
var byte4 = pointer - byte3 * 10;
return output_byte_stream.emit(byte1 + 0x81,
byte2 + 0x30,
byte3 + 0x81,
byte4 + 0x30);
};
}
/** @param {{fatal: boolean}} options */
name_to_encoding['gb18030'].getEncoder = function(options) {
return new GB18030Encoder(options);
};
/** @param {{fatal: boolean}} options */
name_to_encoding['gb18030'].getDecoder = function(options) {
return new GB18030Decoder(options);
};
// 10.2 hz-gb-2312
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function HZGB2312Decoder(options) {
var fatal = options.fatal;
var /** @type {boolean} */ hzgb2312 = false,
/** @type {number} */ hzgb2312_lead = 0x00;
/**
* @param {ByteInputStream} byte_pointer The byte stream to decode.
* @return {?number} The next code point decoded, or null if not enough
* data exists in the input stream to decode a complete code point.
*/
this.decode = function(byte_pointer) {
var bite = byte_pointer.get();
if (bite === EOF_byte && hzgb2312_lead === 0x00) {
return EOF_code_point;
}
if (bite === EOF_byte && hzgb2312_lead !== 0x00) {
hzgb2312_lead = 0x00;
return decoderError(fatal);
}
byte_pointer.offset(1);
if (hzgb2312_lead === 0x7E) {
hzgb2312_lead = 0x00;
if (bite === 0x7B) {
hzgb2312 = true;
return null;
}
if (bite === 0x7D) {
hzgb2312 = false;
return null;
}
if (bite === 0x7E) {
return 0x007E;
}
if (bite === 0x0A) {
return null;
}
byte_pointer.offset(-1);
return decoderError(fatal);
}
if (hzgb2312_lead !== 0x00) {
var lead = hzgb2312_lead;
hzgb2312_lead = 0x00;
var code_point = null;
if (inRange(bite, 0x21, 0x7E)) {
code_point = indexCodePointFor((lead - 1) * 190 +
(bite + 0x3F), index('gb18030'));
}
if (bite === 0x0A) {
hzgb2312 = false;
}
if (code_point === null) {
return decoderError(fatal);
}
return code_point;
}
if (bite === 0x7E) {
hzgb2312_lead = 0x7E;
return null;
}
if (hzgb2312) {
if (inRange(bite, 0x20, 0x7F)) {
hzgb2312_lead = bite;
return null;
}
if (bite === 0x0A) {
hzgb2312 = false;
}
return decoderError(fatal);
}
if (inRange(bite, 0x00, 0x7F)) {
return bite;
}
return decoderError(fatal);
};
}
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function HZGB2312Encoder(options) {
var fatal = options.fatal;
/** @type {boolean} */
var hzgb2312 = false;
/**
* @param {ByteOutputStream} output_byte_stream Output byte stream.
* @param {CodePointInputStream} code_point_pointer Input stream.
* @return {number} The last byte emitted.
*/
this.encode = function(output_byte_stream, code_point_pointer) {
var code_point = code_point_pointer.get();
if (code_point === EOF_code_point) {
return EOF_byte;
}
code_point_pointer.offset(1);
if (inRange(code_point, 0x0000, 0x007F) && hzgb2312) {
code_point_pointer.offset(-1);
hzgb2312 = false;
return output_byte_stream.emit(0x7E, 0x7D);
}
if (code_point === 0x007E) {
return output_byte_stream.emit(0x7E, 0x7E);
}
if (inRange(code_point, 0x0000, 0x007F)) {
return output_byte_stream.emit(code_point);
}
if (!hzgb2312) {
code_point_pointer.offset(-1);
hzgb2312 = true;
return output_byte_stream.emit(0x7E, 0x7B);
}
var pointer = indexPointerFor(code_point, index('gb18030'));
if (pointer === null) {
return encoderError(code_point);
}
var lead = div(pointer, 190) + 1;
var trail = pointer % 190 - 0x3F;
if (!inRange(lead, 0x21, 0x7E) || !inRange(trail, 0x21, 0x7E)) {
return encoderError(code_point);
}
return output_byte_stream.emit(lead, trail);
};
}
/** @param {{fatal: boolean}} options */
name_to_encoding['hz-gb-2312'].getEncoder = function(options) {
return new HZGB2312Encoder(options);
};
/** @param {{fatal: boolean}} options */
name_to_encoding['hz-gb-2312'].getDecoder = function(options) {
return new HZGB2312Decoder(options);
};
//
// 11. Legacy multi-byte Chinese (traditional) encodings
//
// 11.1 big5
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function Big5Decoder(options) {
var fatal = options.fatal;
var /** @type {number} */ big5_lead = 0x00,
/** @type {?number} */ big5_pending = null;
/**
* @param {ByteInputStream} byte_pointer The byte steram to decode.
* @return {?number} The next code point decoded, or null if not enough
* data exists in the input stream to decode a complete code point.
*/
this.decode = function(byte_pointer) {
// NOTE: Hack to support emitting two code points
if (big5_pending !== null) {
var pending = big5_pending;
big5_pending = null;
return pending;
}
var bite = byte_pointer.get();
if (bite === EOF_byte && big5_lead === 0x00) {
return EOF_code_point;
}
if (bite === EOF_byte && big5_lead !== 0x00) {
big5_lead = 0x00;
return decoderError(fatal);
}
byte_pointer.offset(1);
if (big5_lead !== 0x00) {
var lead = big5_lead;
var pointer = null;
big5_lead = 0x00;
var offset = bite < 0x7F ? 0x40 : 0x62;
if (inRange(bite, 0x40, 0x7E) || inRange(bite, 0xA1, 0xFE)) {
pointer = (lead - 0x81) * 157 + (bite - offset);
}
if (pointer === 1133) {
big5_pending = 0x0304;
return 0x00CA;
}
if (pointer === 1135) {
big5_pending = 0x030C;
return 0x00CA;
}
if (pointer === 1164) {
big5_pending = 0x0304;
return 0x00EA;
}
if (pointer === 1166) {
big5_pending = 0x030C;
return 0x00EA;
}
var code_point = (pointer === null) ? null :
indexCodePointFor(pointer, index('big5'));
if (pointer === null) {
byte_pointer.offset(-1);
}
if (code_point === null) {
return decoderError(fatal);
}
return code_point;
}
if (inRange(bite, 0x00, 0x7F)) {
return bite;
}
if (inRange(bite, 0x81, 0xFE)) {
big5_lead = bite;
return null;
}
return decoderError(fatal);
};
}
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function Big5Encoder(options) {
var fatal = options.fatal;
/**
* @param {ByteOutputStream} output_byte_stream Output byte stream.
* @param {CodePointInputStream} code_point_pointer Input stream.
* @return {number} The last byte emitted.
*/
this.encode = function(output_byte_stream, code_point_pointer) {
var code_point = code_point_pointer.get();
if (code_point === EOF_code_point) {
return EOF_byte;
}
code_point_pointer.offset(1);
if (inRange(code_point, 0x0000, 0x007F)) {
return output_byte_stream.emit(code_point);
}
var pointer = indexPointerFor(code_point, index('big5'));
if (pointer === null) {
return encoderError(code_point);
}
var lead = div(pointer, 157) + 0x81;
//if (lead < 0xA1) {
// return encoderError(code_point);
//}
var trail = pointer % 157;
var offset = trail < 0x3F ? 0x40 : 0x62;
return output_byte_stream.emit(lead, trail + offset);
};
}
/** @param {{fatal: boolean}} options */
name_to_encoding['big5'].getEncoder = function(options) {
return new Big5Encoder(options);
};
/** @param {{fatal: boolean}} options */
name_to_encoding['big5'].getDecoder = function(options) {
return new Big5Decoder(options);
};
//
// 12. Legacy multi-byte Japanese encodings
//
// 12.1 euc.jp
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function EUCJPDecoder(options) {
var fatal = options.fatal;
var /** @type {number} */ eucjp_first = 0x00,
/** @type {number} */ eucjp_second = 0x00;
/**
* @param {ByteInputStream} byte_pointer The byte stream to decode.
* @return {?number} The next code point decoded, or null if not enough
* data exists in the input stream to decode a complete code point.
*/
this.decode = function(byte_pointer) {
var bite = byte_pointer.get();
if (bite === EOF_byte) {
if (eucjp_first === 0x00 && eucjp_second === 0x00) {
return EOF_code_point;
}
eucjp_first = 0x00;
eucjp_second = 0x00;
return decoderError(fatal);
}
byte_pointer.offset(1);
var lead, code_point;
if (eucjp_second !== 0x00) {
lead = eucjp_second;
eucjp_second = 0x00;
code_point = null;
if (inRange(lead, 0xA1, 0xFE) && inRange(bite, 0xA1, 0xFE)) {
code_point = indexCodePointFor((lead - 0xA1) * 94 + bite - 0xA1,
index('jis0212'));
}
if (!inRange(bite, 0xA1, 0xFE)) {
byte_pointer.offset(-1);
}
if (code_point === null) {
return decoderError(fatal);
}
return code_point;
}
if (eucjp_first === 0x8E && inRange(bite, 0xA1, 0xDF)) {
eucjp_first = 0x00;
return 0xFF61 + bite - 0xA1;
}
if (eucjp_first === 0x8F && inRange(bite, 0xA1, 0xFE)) {
eucjp_first = 0x00;
eucjp_second = bite;
return null;
}
if (eucjp_first !== 0x00) {
lead = eucjp_first;
eucjp_first = 0x00;
code_point = null;
if (inRange(lead, 0xA1, 0xFE) && inRange(bite, 0xA1, 0xFE)) {
code_point = indexCodePointFor((lead - 0xA1) * 94 + bite - 0xA1,
index('jis0208'));
}
if (!inRange(bite, 0xA1, 0xFE)) {
byte_pointer.offset(-1);
}
if (code_point === null) {
return decoderError(fatal);
}
return code_point;
}
if (inRange(bite, 0x00, 0x7F)) {
return bite;
}
if (bite === 0x8E || bite === 0x8F || (inRange(bite, 0xA1, 0xFE))) {
eucjp_first = bite;
return null;
}
return decoderError(fatal);
};
}
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function EUCJPEncoder(options) {
var fatal = options.fatal;
/**
* @param {ByteOutputStream} output_byte_stream Output byte stream.
* @param {CodePointInputStream} code_point_pointer Input stream.
* @return {number} The last byte emitted.
*/
this.encode = function(output_byte_stream, code_point_pointer) {
var code_point = code_point_pointer.get();
if (code_point === EOF_code_point) {
return EOF_byte;
}
code_point_pointer.offset(1);
if (inRange(code_point, 0x0000, 0x007F)) {
return output_byte_stream.emit(code_point);
}
if (code_point === 0x00A5) {
return output_byte_stream.emit(0x5C);
}
if (code_point === 0x203E) {
return output_byte_stream.emit(0x7E);
}
if (inRange(code_point, 0xFF61, 0xFF9F)) {
return output_byte_stream.emit(0x8E, code_point - 0xFF61 + 0xA1);
}
var pointer = indexPointerFor(code_point, index('jis0208'));
if (pointer === null) {
return encoderError(code_point);
}
var lead = div(pointer, 94) + 0xA1;
var trail = pointer % 94 + 0xA1;
return output_byte_stream.emit(lead, trail);
};
}
/** @param {{fatal: boolean}} options */
name_to_encoding['euc-jp'].getEncoder = function(options) {
return new EUCJPEncoder(options);
};
/** @param {{fatal: boolean}} options */
name_to_encoding['euc-jp'].getDecoder = function(options) {
return new EUCJPDecoder(options);
};
// 12.2 iso-2022-jp
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function ISO2022JPDecoder(options) {
var fatal = options.fatal;
/** @enum */
var state = {
ASCII: 0,
escape_start: 1,
escape_middle: 2,
escape_final: 3,
lead: 4,
trail: 5,
Katakana: 6
};
var /** @type {number} */ iso2022jp_state = state.ASCII,
/** @type {boolean} */ iso2022jp_jis0212 = false,
/** @type {number} */ iso2022jp_lead = 0x00;
/**
* @param {ByteInputStream} byte_pointer The byte stream to decode.
* @return {?number} The next code point decoded, or null if not enough
* data exists in the input stream to decode a complete code point.
*/
this.decode = function(byte_pointer) {
var bite = byte_pointer.get();
if (bite !== EOF_byte) {
byte_pointer.offset(1);
}
switch (iso2022jp_state) {
default:
case state.ASCII:
if (bite === 0x1B) {
iso2022jp_state = state.escape_start;
return null;
}
if (inRange(bite, 0x00, 0x7F)) {
return bite;
}
if (bite === EOF_byte) {
return EOF_code_point;
}
return decoderError(fatal);
case state.escape_start:
if (bite === 0x24 || bite === 0x28) {
iso2022jp_lead = bite;
iso2022jp_state = state.escape_middle;
return null;
}
if (bite !== EOF_byte) {
byte_pointer.offset(-1);
}
iso2022jp_state = state.ASCII;
return decoderError(fatal);
case state.escape_middle:
var lead = iso2022jp_lead;
iso2022jp_lead = 0x00;
if (lead === 0x24 && (bite === 0x40 || bite === 0x42)) {
iso2022jp_jis0212 = false;
iso2022jp_state = state.lead;
return null;
}
if (lead === 0x24 && bite === 0x28) {
iso2022jp_state = state.escape_final;
return null;
}
if (lead === 0x28 && (bite === 0x42 || bite === 0x4A)) {
iso2022jp_state = state.ASCII;
return null;
}
if (lead === 0x28 && bite === 0x49) {
iso2022jp_state = state.Katakana;
return null;
}
if (bite === EOF_byte) {
byte_pointer.offset(-1);
} else {
byte_pointer.offset(-2);
}
iso2022jp_state = state.ASCII;
return decoderError(fatal);
case state.escape_final:
if (bite === 0x44) {
iso2022jp_jis0212 = true;
iso2022jp_state = state.lead;
return null;
}
if (bite === EOF_byte) {
byte_pointer.offset(-2);
} else {
byte_pointer.offset(-3);
}
iso2022jp_state = state.ASCII;
return decoderError(fatal);
case state.lead:
if (bite === 0x0A) {
iso2022jp_state = state.ASCII;
return decoderError(fatal, 0x000A);
}
if (bite === 0x1B) {
iso2022jp_state = state.escape_start;
return null;
}
if (bite === EOF_byte) {
return EOF_code_point;
}
iso2022jp_lead = bite;
iso2022jp_state = state.trail;
return null;
case state.trail:
iso2022jp_state = state.lead;
if (bite === EOF_byte) {
return decoderError(fatal);
}
var code_point = null;
var pointer = (iso2022jp_lead - 0x21) * 94 + bite - 0x21;
if (inRange(iso2022jp_lead, 0x21, 0x7E) &&
inRange(bite, 0x21, 0x7E)) {
code_point = (iso2022jp_jis0212 === false) ?
indexCodePointFor(pointer, index('jis0208')) :
indexCodePointFor(pointer, index('jis0212'));
}
if (code_point === null) {
return decoderError(fatal);
}
return code_point;
case state.Katakana:
if (bite === 0x1B) {
iso2022jp_state = state.escape_start;
return null;
}
if (inRange(bite, 0x21, 0x5F)) {
return 0xFF61 + bite - 0x21;
}
if (bite === EOF_byte) {
return EOF_code_point;
}
return decoderError(fatal);
}
};
}
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function ISO2022JPEncoder(options) {
var fatal = options.fatal;
/** @enum */
var state = {
ASCII: 0,
lead: 1,
Katakana: 2
};
var /** @type {number} */ iso2022jp_state = state.ASCII;
/**
* @param {ByteOutputStream} output_byte_stream Output byte stream.
* @param {CodePointInputStream} code_point_pointer Input stream.
* @return {number} The last byte emitted.
*/
this.encode = function(output_byte_stream, code_point_pointer) {
var code_point = code_point_pointer.get();
if (code_point === EOF_code_point) {
return EOF_byte;
}
code_point_pointer.offset(1);
if ((inRange(code_point, 0x0000, 0x007F) ||
code_point === 0x00A5 || code_point === 0x203E) &&
iso2022jp_state !== state.ASCII) {
code_point_pointer.offset(-1);
iso2022jp_state = state.ASCII;
return output_byte_stream.emit(0x1B, 0x28, 0x42);
}
if (inRange(code_point, 0x0000, 0x007F)) {
return output_byte_stream.emit(code_point);
}
if (code_point === 0x00A5) {
return output_byte_stream.emit(0x5C);
}
if (code_point === 0x203E) {
return output_byte_stream.emit(0x7E);
}
if (inRange(code_point, 0xFF61, 0xFF9F) &&
iso2022jp_state !== state.Katakana) {
code_point_pointer.offset(-1);
iso2022jp_state = state.Katakana;
return output_byte_stream.emit(0x1B, 0x28, 0x49);
}
if (inRange(code_point, 0xFF61, 0xFF9F)) {
return output_byte_stream.emit(code_point - 0xFF61 - 0x21);
}
if (iso2022jp_state !== state.lead) {
code_point_pointer.offset(-1);
iso2022jp_state = state.lead;
return output_byte_stream.emit(0x1B, 0x24, 0x42);
}
var pointer = indexPointerFor(code_point, index('jis0208'));
if (pointer === null) {
return encoderError(code_point);
}
var lead = div(pointer, 94) + 0x21;
var trail = pointer % 94 + 0x21;
return output_byte_stream.emit(lead, trail);
};
}
/** @param {{fatal: boolean}} options */
name_to_encoding['iso-2022-jp'].getEncoder = function(options) {
return new ISO2022JPEncoder(options);
};
/** @param {{fatal: boolean}} options */
name_to_encoding['iso-2022-jp'].getDecoder = function(options) {
return new ISO2022JPDecoder(options);
};
// 12.3 shift_jis
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function ShiftJISDecoder(options) {
var fatal = options.fatal;
var /** @type {number} */ shiftjis_lead = 0x00;
/**
* @param {ByteInputStream} byte_pointer The byte stream to decode.
* @return {?number} The next code point decoded, or null if not enough
* data exists in the input stream to decode a complete code point.
*/
this.decode = function(byte_pointer) {
var bite = byte_pointer.get();
if (bite === EOF_byte && shiftjis_lead === 0x00) {
return EOF_code_point;
}
if (bite === EOF_byte && shiftjis_lead !== 0x00) {
shiftjis_lead = 0x00;
return decoderError(fatal);
}
byte_pointer.offset(1);
if (shiftjis_lead !== 0x00) {
var lead = shiftjis_lead;
shiftjis_lead = 0x00;
if (inRange(bite, 0x40, 0x7E) || inRange(bite, 0x80, 0xFC)) {
var offset = (bite < 0x7F) ? 0x40 : 0x41;
var lead_offset = (lead < 0xA0) ? 0x81 : 0xC1;
var code_point = indexCodePointFor((lead - lead_offset) * 188 +
bite - offset, index('jis0208'));
if (code_point === null) {
return decoderError(fatal);
}
return code_point;
}
byte_pointer.offset(-1);
return decoderError(fatal);
}
if (inRange(bite, 0x00, 0x80)) {
return bite;
}
if (inRange(bite, 0xA1, 0xDF)) {
return 0xFF61 + bite - 0xA1;
}
if (inRange(bite, 0x81, 0x9F) || inRange(bite, 0xE0, 0xFC)) {
shiftjis_lead = bite;
return null;
}
return decoderError(fatal);
};
}
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function ShiftJISEncoder(options) {
var fatal = options.fatal;
/**
* @param {ByteOutputStream} output_byte_stream Output byte stream.
* @param {CodePointInputStream} code_point_pointer Input stream.
* @return {number} The last byte emitted.
*/
this.encode = function(output_byte_stream, code_point_pointer) {
var code_point = code_point_pointer.get();
if (code_point === EOF_code_point) {
return EOF_byte;
}
code_point_pointer.offset(1);
if (inRange(code_point, 0x0000, 0x0080)) {
return output_byte_stream.emit(code_point);
}
if (code_point === 0x00A5) {
return output_byte_stream.emit(0x5C);
}
if (code_point === 0x203E) {
return output_byte_stream.emit(0x7E);
}
if (inRange(code_point, 0xFF61, 0xFF9F)) {
return output_byte_stream.emit(code_point - 0xFF61 + 0xA1);
}
var pointer = indexPointerFor(code_point, index('jis0208'));
if (pointer === null) {
return encoderError(code_point);
}
var lead = div(pointer, 188);
var lead_offset = lead < 0x1F ? 0x81 : 0xC1;
var trail = pointer % 188;
var offset = trail < 0x3F ? 0x40 : 0x41;
return output_byte_stream.emit(lead + lead_offset, trail + offset);
};
}
/** @param {{fatal: boolean}} options */
name_to_encoding['shift_jis'].getEncoder = function(options) {
return new ShiftJISEncoder(options);
};
/** @param {{fatal: boolean}} options */
name_to_encoding['shift_jis'].getDecoder = function(options) {
return new ShiftJISDecoder(options);
};
//
// 13. Legacy multi-byte Korean encodings
//
// 13.1 euc-kr
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function EUCKRDecoder(options) {
var fatal = options.fatal;
var /** @type {number} */ euckr_lead = 0x00;
/**
* @param {ByteInputStream} byte_pointer The byte stream to decode.
* @return {?number} The next code point decoded, or null if not enough
* data exists in the input stream to decode a complete code point.
*/
this.decode = function(byte_pointer) {
var bite = byte_pointer.get();
if (bite === EOF_byte && euckr_lead === 0) {
return EOF_code_point;
}
if (bite === EOF_byte && euckr_lead !== 0) {
euckr_lead = 0x00;
return decoderError(fatal);
}
byte_pointer.offset(1);
if (euckr_lead !== 0x00) {
var lead = euckr_lead;
var pointer = null;
euckr_lead = 0x00;
if (inRange(lead, 0x81, 0xC6)) {
var temp = (26 + 26 + 126) * (lead - 0x81);
if (inRange(bite, 0x41, 0x5A)) {
pointer = temp + bite - 0x41;
} else if (inRange(bite, 0x61, 0x7A)) {
pointer = temp + 26 + bite - 0x61;
} else if (inRange(bite, 0x81, 0xFE)) {
pointer = temp + 26 + 26 + bite - 0x81;
}
}
if (inRange(lead, 0xC7, 0xFD) && inRange(bite, 0xA1, 0xFE)) {
pointer = (26 + 26 + 126) * (0xC7 - 0x81) + (lead - 0xC7) * 94 +
(bite - 0xA1);
}
var code_point = (pointer === null) ? null :
indexCodePointFor(pointer, index('euc-kr'));
if (pointer === null) {
byte_pointer.offset(-1);
}
if (code_point === null) {
return decoderError(fatal);
}
return code_point;
}
if (inRange(bite, 0x00, 0x7F)) {
return bite;
}
if (inRange(bite, 0x81, 0xFD)) {
euckr_lead = bite;
return null;
}
return decoderError(fatal);
};
}
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function EUCKREncoder(options) {
var fatal = options.fatal;
/**
* @param {ByteOutputStream} output_byte_stream Output byte stream.
* @param {CodePointInputStream} code_point_pointer Input stream.
* @return {number} The last byte emitted.
*/
this.encode = function(output_byte_stream, code_point_pointer) {
var code_point = code_point_pointer.get();
if (code_point === EOF_code_point) {
return EOF_byte;
}
code_point_pointer.offset(1);
if (inRange(code_point, 0x0000, 0x007F)) {
return output_byte_stream.emit(code_point);
}
var pointer = indexPointerFor(code_point, index('euc-kr'));
if (pointer === null) {
return encoderError(code_point);
}
var lead, trail;
if (pointer < ((26 + 26 + 126) * (0xC7 - 0x81))) {
lead = div(pointer, (26 + 26 + 126)) + 0x81;
trail = pointer % (26 + 26 + 126);
var offset = trail < 26 ? 0x41 : trail < 26 + 26 ? 0x47 : 0x4D;
return output_byte_stream.emit(lead, trail + offset);
}
pointer = pointer - (26 + 26 + 126) * (0xC7 - 0x81);
lead = div(pointer, 94) + 0xC7;
trail = pointer % 94 + 0xA1;
return output_byte_stream.emit(lead, trail);
};
}
/** @param {{fatal: boolean}} options */
name_to_encoding['euc-kr'].getEncoder = function(options) {
return new EUCKREncoder(options);
};
/** @param {{fatal: boolean}} options */
name_to_encoding['euc-kr'].getDecoder = function(options) {
return new EUCKRDecoder(options);
};
//
// 14. Legacy miscellaneous encodings
//
// 14.1 replacement
// Not needed - API throws TypeError
// 14.2 utf-16
/**
* @constructor
* @param {boolean} utf16_be True if big-endian, false if little-endian.
* @param {{fatal: boolean}} options
*/
function UTF16Decoder(utf16_be, options) {
var fatal = options.fatal;
var /** @type {?number} */ utf16_lead_byte = null,
/** @type {?number} */ utf16_lead_surrogate = null;
/**
* @param {ByteInputStream} byte_pointer The byte stream to decode.
* @return {?number} The next code point decoded, or null if not enough
* data exists in the input stream to decode a complete code point.
*/
this.decode = function(byte_pointer) {
var bite = byte_pointer.get();
if (bite === EOF_byte && utf16_lead_byte === null &&
utf16_lead_surrogate === null) {
return EOF_code_point;
}
if (bite === EOF_byte && (utf16_lead_byte !== null ||
utf16_lead_surrogate !== null)) {
return decoderError(fatal);
}
byte_pointer.offset(1);
if (utf16_lead_byte === null) {
utf16_lead_byte = bite;
return null;
}
var code_point;
if (utf16_be) {
code_point = (utf16_lead_byte << 8) + bite;
} else {
code_point = (bite << 8) + utf16_lead_byte;
}
utf16_lead_byte = null;
if (utf16_lead_surrogate !== null) {
var lead_surrogate = utf16_lead_surrogate;
utf16_lead_surrogate = null;
if (inRange(code_point, 0xDC00, 0xDFFF)) {
return 0x10000 + (lead_surrogate - 0xD800) * 0x400 +
(code_point - 0xDC00);
}
byte_pointer.offset(-2);
return decoderError(fatal);
}
if (inRange(code_point, 0xD800, 0xDBFF)) {
utf16_lead_surrogate = code_point;
return null;
}
if (inRange(code_point, 0xDC00, 0xDFFF)) {
return decoderError(fatal);
}
return code_point;
};
}
/**
* @constructor
* @param {boolean} utf16_be True if big-endian, false if little-endian.
* @param {{fatal: boolean}} options
*/
function UTF16Encoder(utf16_be, options) {
var fatal = options.fatal;
/**
* @param {ByteOutputStream} output_byte_stream Output byte stream.
* @param {CodePointInputStream} code_point_pointer Input stream.
* @return {number} The last byte emitted.
*/
this.encode = function(output_byte_stream, code_point_pointer) {
/**
* @param {number} code_unit
* @return {number} last byte emitted
*/
function convert_to_bytes(code_unit) {
var byte1 = code_unit >> 8;
var byte2 = code_unit & 0x00FF;
if (utf16_be) {
return output_byte_stream.emit(byte1, byte2);
}
return output_byte_stream.emit(byte2, byte1);
}
var code_point = code_point_pointer.get();
if (code_point === EOF_code_point) {
return EOF_byte;
}
code_point_pointer.offset(1);
if (inRange(code_point, 0xD800, 0xDFFF)) {
encoderError(code_point);
}
if (code_point <= 0xFFFF) {
return convert_to_bytes(code_point);
}
var lead = div((code_point - 0x10000), 0x400) + 0xD800;
var trail = ((code_point - 0x10000) % 0x400) + 0xDC00;
convert_to_bytes(lead);
return convert_to_bytes(trail);
};
}
// 14.3 utf-16be
/** @param {{fatal: boolean}} options */
name_to_encoding['utf-16be'].getEncoder = function(options) {
return new UTF16Encoder(true, options);
};
/** @param {{fatal: boolean}} options */
name_to_encoding['utf-16be'].getDecoder = function(options) {
return new UTF16Decoder(true, options);
};
// 14.4 utf-16le
/** @param {{fatal: boolean}} options */
name_to_encoding['utf-16le'].getEncoder = function(options) {
return new UTF16Encoder(false, options);
};
/** @param {{fatal: boolean}} options */
name_to_encoding['utf-16le'].getDecoder = function(options) {
return new UTF16Decoder(false, options);
};
// 14.5 x-user-defined
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function XUserDefinedDecoder(options) {
var fatal = options.fatal;
/**
* @param {ByteInputStream} byte_pointer The byte stream to decode.
* @return {?number} The next code point decoded, or null if not enough
* data exists in the input stream to decode a complete code point.
*/
this.decode = function(byte_pointer) {
var bite = byte_pointer.get();
if (bite === EOF_byte) {
return EOF_code_point;
}
byte_pointer.offset(1);
if (inRange(bite, 0x00, 0x7F)) {
return bite;
}
return 0xF780 + bite - 0x80;
};
}
/**
* @constructor
* @param {{fatal: boolean}} options
*/
function XUserDefinedEncoder(options) {
var fatal = options.fatal;
/**
* @param {ByteOutputStream} output_byte_stream Output byte stream.
* @param {CodePointInputStream} code_point_pointer Input stream.
* @return {number} The last byte emitted.
*/
this.encode = function(output_byte_stream, code_point_pointer) {
var code_point = code_point_pointer.get();
if (code_point === EOF_code_point) {
return EOF_byte;
}
code_point_pointer.offset(1);
if (inRange(code_point, 0x0000, 0x007F)) {
return output_byte_stream.emit(code_point);
}
if (inRange(code_point, 0xF780, 0xF7FF)) {
return output_byte_stream.emit(code_point - 0xF780 + 0x80);
}
return encoderError(code_point);
};
}
/** @param {{fatal: boolean}} options */
name_to_encoding['x-user-defined'].getEncoder = function(options) {
return new XUserDefinedEncoder(options);
};
/** @param {{fatal: boolean}} options */
name_to_encoding['x-user-defined'].getDecoder = function(options) {
return new XUserDefinedDecoder(options);
};
// NOTE: currently unused
/**
* @param {string} label The encoding label.
* @param {ByteInputStream} input_stream The byte stream to test.
*/
function detectEncoding(label, input_stream) {
if (input_stream.match([0xFF, 0xFE])) {
input_stream.offset(2);
return 'utf-16le';
}
if (input_stream.match([0xFE, 0xFF])) {
input_stream.offset(2);
return 'utf-16be';
}
if (input_stream.match([0xEF, 0xBB, 0xBF])) {
input_stream.offset(3);
return 'utf-8';
}
return label;
}
if (!('TextEncoder' in global)) global['TextEncoder'] = TextEncoder;
if (!('TextDecoder' in global)) global['TextDecoder'] = TextDecoder;
}(this));