diff --git a/doc/encoding.rdoc b/doc/encoding.rdoc new file mode 100644 index 0000000000..6f663b14cd --- /dev/null +++ b/doc/encoding.rdoc @@ -0,0 +1,260 @@ +== \Encoding + +=== The Basics + +A {character encoding}[https://en.wikipedia.org/wiki/Character_encoding], +often shortened to _encoding_, is a mapping between: + +- A sequence of 8-bit bytes (each byte in the range 0..255). +- Characters in a specific character set. + +Some character sets contain only 1-byte characters; +{US-ASCII}[https://en.wikipedia.org/wiki/ASCII], for example, has 256 1-byte characters. +This string, encoded in US-ASCII, has six characters that are stored as six bytes: + + s = 'Hello!'.encode('US-ASCII') # => "Hello!" + s.encoding # => # + s.bytes # => [72, 101, 108, 108, 111, 33] + +Other encodings may involve multi-byte characters. +{UTF-8}[https://en.wikipedia.org/wiki/UTF-8], for example, +encodes more than one million characters, encoding each in one to four bytes. +The lowest-valued of these characters correspond to ASCII characters, +and so are 1-byte characters: + + s = 'Hello!' # => "Hello!" + s.bytes # => [72, 101, 108, 108, 111, 33] + +Other characters, such as the Euro symbol, are multi-byte: + + s = "\u20ac" # => "€" + s.bytes # => [226, 130, 172] + +=== The \Encoding \Class + +==== \Encoding Objects + +Ruby encodings are defined by constants in class \Encoding. +There can be only one instance of \Encoding for each of these constants. +\Method Encoding.list returns an array of \Encoding objects (one for each constant): + + Encoding.list.size # => 103 + Encoding.list.first.class # => Encoding + Encoding.list.take(3) + # => [#, #, #] + +==== Names and Aliases + +\Method Encoding#name returns the name of an \Encoding: + + Encoding::ASCII_8BIT.name # => "ASCII-8BIT" + Encoding::WINDOWS_31J.name # => "Windows-31J" + +An \Encoding object has zero or more aliases; +method Encoding#names returns an array containing the name and all aliases: + + Encoding::ASCII_8BIT.names + # => ["ASCII-8BIT", "BINARY"] + Encoding::WINDOWS_31J.names + #=> ["Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK"] + +\Method Encoding.aliases returns a hash of all alias/name pairs: + + Encoding.aliases.size # => 71 + Encoding.aliases.take(3) + # => [["BINARY", "ASCII-8BIT"], ["CP437", "IBM437"], ["CP720", "IBM720"]] + +\Method Encoding.name_list returns an array of all the encoding names and aliases: + + Encoding.name_list.size # => 175 + Encoding.name_list.take(3) + # => ["ASCII-8BIT", "UTF-8", "US-ASCII"] + +\Method +name_list+ returns more entries than method +list+ +because it includes both the names and their aliases. + +\Method Encoding.find returns the \Encoding for a given name or alias, if it exists: + + Encoding.find("US-ASCII") # => # + Encoding.find("US-ASCII").class # => Encoding + +==== Default Encodings + +\Method Encoding.find, above, also returns a default \Encoding +for each of these special names: + +- +external+: the default external \Encoding: + + Encoding.find("external") # => # + +- +internal+: the default internal \Encoding (may be +nil+): + + Encoding.find("internal") # => nil + +- +locale+: the default \Encoding for a string from the environment: + + Encoding.find("locale") # => # # Linux + Encoding.find("locale") # => # # Windows + +- +filesystem+: the default \Encoding for a string from the filesystem: + + Encoding.find("filesystem") # => # + +\Method Encoding.default_external returns the default external \Encoding: + + Encoding.default_external # => # + +\Method Encoding.default_external= sets that value: + + Encoding.default_external = 'US-ASCII' # => "US-ASCII" + Encoding.default_external # => # + +\Method Encoding.default_internal returns the default internal \Encoding: + + Encoding.default_internal # => nil + +\Method Encoding.default_internal= sets the default internal \Encoding: + + Encoding.default_internal = 'US-ASCII' # => "US-ASCII" + Encoding.default_internal # => # + +==== Compatible Encodings + +\Method Encoding.compatible? returns whether two given objects are encoding-compatible +(that is, whether they can be concatenated); +returns the \Encoding of the concatenated string, or +nil+ if incompatible: + + rus = "\u{442 435 441 442}" + eng = 'text' + Encoding.compatible?(rus, eng) # => # + + s0 = "\xa1\xa1".force_encoding('iso-8859-1') # => "\xA1\xA1" + s1 = "\xa1\xa1".force_encoding('euc-jp') # => "\x{A1A1}" + Encoding.compatible?(s0, s1) # => nil + +==== \Encoding Options + +A number of methods in the Ruby core accept keyword arguments as encoding options. + +Some of the options specify or utilize a _replacement_ _string_, to be used +in certain transcoding operations. +A replacement string may be in any encoding that can be converted +to the encoding of the destination string. + +These keyword-value pairs specify encoding options: + +- For an invalid byte sequence: + + - :invalid: nil (default): Raise exception. + - :invalid: :replace: Replace each invalid byte sequence + with the replacement string. + + Examples: + + s = "\x80foo\x80" + s.encode('ISO-8859-3') # Raises Encoding::InvalidByteSequenceError. + s.encode('ISO-8859-3', invalid: :replace) # => "?foo?" + +- For an undefined character: + + - :undef: nil (default): Raise exception. + - :undef: :replace: Replace each undefined character + with the replacement string. + + Examples: + + s = "\x80foo\x80" + "\x80".encode('UTF-8', 'ASCII-8BIT') # Raises Encoding::UndefinedConversionError. + s.encode('UTF-8', 'ASCII-8BIT', undef: :replace) # => "�foo�" + + +- Replacement string: + + - :replace: nil (default): Set replacement string to default value: + "\uFFFD" ("�") for a Unicode encoding, '?' otherwise. + - :replace: _some_string_: Set replacement string to the given +some_string+; + overrides +:fallback+. + + Examples: + + s = "\xA5foo\xA5" + options = {:undef => :replace, :replace => 'xyzzy'} + s.encode('UTF-8', 'ISO-8859-3', **options) # => "xyzzyfooxyzzy" + +- Replacement fallback: + + One of these may be specified: + + - :fallback: nil (default): No replacement fallback. + - :fallback: _hash_like_object_: Set replacement fallback to the given + +hash_like_object+; the replacement string is _hash_like_object_[X]. + - :fallback: _method_: Set replacement fallback to the given + +method+; the replacement string is _method_(X). + - :fallback: _proc_: Set replacement fallback to the given + +proc+; the replacement string is _proc_[X]. + + Examples: + + s = "\u3042foo\u3043" + + hash = {"\u3042" => 'xyzzy'} + hash.default = 'XYZZY' + s.encode('ASCII', fallback: h) # => "xyzzyfooXYZZY" + + def (fallback = "U+%.4X").escape(x) + self % x.unpack("U") + end + "\u{3042}".encode("US-ASCII", fallback: fallback.method(:escape)) # => "U+3042" + + proc = Proc.new {|x| x == "\u3042" ? 'xyzzy' : 'XYZZY' } + s.encode('ASCII', fallback: proc) # => "XYZZYfooXYZZY" + +- XML entities: + + One of these may be specified: + + - :xml: nil (default): No handling for XML entities. + - :xml: :text: Treat source text as XML; + replace each undefined character + with its upper-case hexdecimal numeric character reference, + except that: + + - & is replaced with &. + - < is replaced with <. + - > is replaced with >. + + - :xml: :attr: Treat source text as XML attribute value; + replace each undefined character + with its upper-case hexdecimal numeric character reference, + except that: + + - The replacement string r is double-quoted ("r"). + - Each embedded double-quote is replaced with ". + - & is replaced with &. + - < is replaced with <. + - > is replaced with >. + + Examples: + + s = 'foo"<&>"bar' + "\u3042" + s.encode('ASCII', xml: :text) # => "foo\"<&>\"barあ" + s.encode('ASCII', xml: :attr) # => "\"foo"<&>"barあ\"" + + +- Newlines: + + One of these may be specified: + + - :cr_newline: true: Replace each line-feed character ("\n") + with a carriage-return character ("\r"). + - :crlf_newline: true: Replace each line-feed character ("\n") + with a carriage-return/line-feed string ("\r\n"). + - :universal_newline: true: Replace each carriage-return/line-feed string + ("\r\n") with a line-feed character ("\n"). + + Examples: + + s = "\n \r\n" # => "\n \r\n" + s.encode('ASCII', cr_newline: true) # => "\r \r\r" + s.encode('ASCII', crlf_newline: true) # => "\r\n \r\r\n" + s.encode('ASCII', universal_newline: true) # => "\n \n"