v0.5.1 + NewReader

2020-02-13 01:39:44 +04:00 · 2020-02-13 01:39:44 +04:00 · 0d40555dad
--- a/HIST.md
+++ b/HIST.md
@ -41,3 +41,14 @@ _____________________________
   - string UTF32 w/o bom and w/o russian char detect as UTF16

 _____________________________
+
+## ver 0.4.1 // 2020.02.05 ##
+
+* add function NewReader() - conversion to UTF8 with automatic detection
+* add function NewReaderTo() - conversion from UTF8 to the specified codepage
+
+### todo ###
+
+   - string UTF32 w/o bom and w/o russian char detect as UTF16
+
+_____________________________
--- a/README-RU.md
+++ b/README-RU.md
@ -10,18 +10,18 @@
 предназначена для автоматического определения кодовой страницы текстовых файлов или потоков байт  
 поддерживает следующие кодовые страницы:

-no ID                Name		uint16
-
-1. ASCII:            "ASCII",		3
-2. ISOLatinCyrillic: "ISO-8859-5",	8
-3. CP866:            "CP866",		2086
-4. Windows1251:      "Windows-1251",	2251
-5. UTF8:             "UTF-8",		106
-6. UTF16LE:          "UTF-16LE",	1014
-7. UTF16BE:          "UTF-16BE",	1013
-8. KOI8R:            "KOI8-R",		2084
-9. UTF32LE:          "UTF-32LE",	1019
-10. UTF32BE:         "UTF-32BE",	1018
+| no | ID               | Name           | uint16  |
+| -- | ---------------- | -------------- | ------- |
+| 1. | ASCII            | "ASCII"        |      3  |
+| 2. | ISOLatinCyrillic | "ISO-8859-5"   |      8  |
+| 3. | CP866            | "CP866"        |   2086  |
+| 4. | Windows1251      | "Windows-1251" |   2251  |
+| 5. | UTF8             | "UTF-8"        |    106  |
+| 6. | UTF16LE          | "UTF-16LE"     |   1014  |
+| 7. | UTF16BE          | "UTF-16BE"     |   1013  |
+| 8. | KOI8R            | "KOI8-R"       |   2084  |
+| 9. | UTF32LE          | "UTF-32LE"     |   1019  |
+| 10.| UTF32BE          | "UTF-32BE"     |   1018  |

 ## особенности ##

@ -38,14 +38,13 @@ no ID                Name		uint16
 >"golang.org/x/text/encoding/charmap"  
 >"golang.org/x/text/transform"  

-
 ## типы ##

 IDCodePage uint16 - индекс кодовой страницы, значения взяты из файла поставки golang golang.org\x\text\encoding\internal\identifier\mib.go
 поддерживается interface String(), и можно выводить так
    cp := cpd.UTF8
    fmt.Printf("code page index, name: %d, %s\n", cp, cp)
-    //>code page index, name: 106, UTF-8
+    >>code page index, name: 106, UTF-8

 ## глобальные переменные ##

@ -53,29 +52,57 @@ ReadBufSize int = 1024 // количество байт считываемых

 ## функции ##

-1. CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error)
-2. FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error)
+1. CodepageAutoDetect(b []byte) IDCodePage
+2. CodepageDetect(r io.Reader, stopStr ...string) (IDCodePage, error)
+3. FileCodepageDetect(fn string, stopStr ...string) (IDCodePage, error)
+4. NewReader(r io.Reader, cpn ...string) (io.Reader, error)
+5. NewReaderTo(r io.Reader, cpn string) (io.Reader, error)
+6. SupportedEncoder(cpn string) bool

 ## описание ##

-    func CodePageAutoDetect(content []byte) (result IDCodePage) 
+    CodepageAutoDetect(content []byte) (result IDCodePage) 
      автоматическое определеие кодировки по входному слайсу байт
      использовать вместо golang.org/x/net/html/charset.DetermineEncoding()

-    CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error)
+    CodepageDetect(r io.Reader, stopStr ...string) (IDCodePage, error)
      определяет кодовую страницу считывая поток байтов из 'r' 
      используется 'reflect.ValueOf(r).IsValid()' для проверки 'r' на существование
      считывает из 'r' первые ReadBufSize байтов
      параметр stopStr пока не используется

-    FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error)
+    FileCodepageDetect(fn string, stopStr ...string) (IDCodePage, error)
      определяет кодовую страницу считывая файл 'fn', считывает из файла первые ReadBufSize байтов
      ошибку возвращает если проблемы с открытием файла 'fn'
      возвращает cpd.ASCII если колировка не определена
+    
+    NewReader(r io.Reader, cpn ...string) (io.Reader, error)
+      конвертация из указанной кодировки в UTF-8
+      r - ридер из которого читаем
+      cpn - имя кодировки в которой представлены входные данные, необязательный параметр
+      создаёт новый io.Reader, чтение из которого будет в формате UTF-8,
+      входная кодировка определяется автоматически, либо можно задать имя в параметре cpn
+      если имя входной кодировки неверное (отсутствует в словаре) то выполняется автоопределение
+      может вернуть ошибку чтения из входного ридера, либо ошибку неизвестной кодировки (кодировка из которой невозможно преобразовать в UTF-8)
+
+    NewReaderTo(r io.Reader, cpn string) (io.Reader, error)
+      конвертация из UTF-8 в целевую кодировку 
+      r - ридер из которого читаем, обязательно в UTF-8
+      cpn - имя кодировки в которую преобразуем данные
+      создаёт новый io.Reader, чтение из которого будет в кодировке cpn,
+      может вернуть ошибку чтения из входного ридера, либо ошибку неизвестной выходной кодировки
+
+    SupportedEncoder(cpn string) bool
+      проверка кодировки на возможность преобразования

 ## tests & static analiz ##

-coverage: 89% of statements  
+coverage: 89.8%  
 в папке "test_files" лежат файлы для тестов, соответственно не править и не удалять
+в папке "sample" примеры
+
+1. tohex -- подаём строку и желаемую кодировку, получаем шестнадцатеричные коды символов строки в указанной кодировке. пример боевой, полученную строку можно забрать и вставить в код golang
+2. detect-all-files -- выводит кодировку всех файлов найденных в текущем каталоге с указанным расширением
+3. cpname -- пример работы с именами кодировок и прохода по всем кодировкам

 linter.md отчёт статического анализатора golangci-lint
--- a/README.md
+++ b/README.md
@ -8,18 +8,18 @@
 golang library for detecting code page of text files  
 multibyte code pages and single-byte Russian code pages are supported:

-no ID                Name		uint16
-
-1. ASCII:            "ASCII",		3
-2. ISOLatinCyrillic: "ISO-8859-5",	8
-3. CP866:            "CP866",		2086
-4. Windows1251:      "Windows-1251",	2251
-5. UTF8:             "UTF-8",		106
-6. UTF16LE:          "UTF-16LE",	1014
-7. UTF16BE:          "UTF-16BE",	1013
-8. KOI8R:            "KOI8-R",		2084
-9. UTF32LE:          "UTF-32LE",	1019
-10. UTF32BE:         "UTF-32BE",	1018
+| no | ID               | Name           | uint16  |
+| -- | ---------------- | -------------- | ------- |
+| 1. | ASCII            | "ASCII"        |      3  |
+| 2. | ISOLatinCyrillic | "ISO-8859-5"   |      8  |
+| 3. | CP866            | "CP866"        |   2086  |
+| 4. | Windows1251      | "Windows-1251" |   2251  |
+| 5. | UTF8             | "UTF-8"        |    106  |
+| 6. | UTF16LE          | "UTF-16LE"     |   1014  |
+| 7. | UTF16BE          | "UTF-16BE"     |   1013  |
+| 8. | KOI8R            | "KOI8-R"       |   2084  |
+| 9. | UTF32LE          | "UTF-32LE"     |   1019  |
+| 10.| UTF32BE          | "UTF-32BE"     |   1018  |

 ## feature ##

--- a/code_pages.go
+++ b/code_pages.go
@ -1,8 +1,10 @@
 package cpd

 import (
+	"bufio"
 	"bytes"
 	"fmt"
+	"io"
 	"strings"
 )

@ -32,8 +34,45 @@ func (i IDCodePage) DeleteBom(s string) (res string) {
 	return res
 }

+// BomLen - return the length in bytes of the BOM registered in Boms for this codepage
+// for a codepage that has no entry in Boms, return 0
+func (i IDCodePage) BomLen() int {
+	for _, b := range Boms {
+		if b.id == i {
+			return len(b.Bom)
+		}
+	}
+	return 0
+}
+
+// ReaderHasBom - report whether the data in r starts with the BOM of codepage i; false on a peek error or when the codepage has no BOM. NOTE(review): the peek goes through bufio.NewReader(r), so unless r is already a *bufio.Reader the peeked bytes are consumed from r - confirm callers
+func (i IDCodePage) ReaderHasBom(r io.Reader) bool {
+	buf, err := bufio.NewReader(r).Peek(i.BomLen())
+	if err != nil || len(buf) == 0 { // empty peek: this codepage has no BOM to look for, and bytes.HasPrefix would report true for an empty prefix
+		return false
+	}
+	return bytes.HasPrefix(buf, codepageDic[i].Boms)
+}
+
+// DeleteBomFromReader - if r starts with the BOM of codepage i, read and drop exactly that BOM's bytes; always returns r itself
+func (i IDCodePage) DeleteBomFromReader(r io.Reader) io.Reader {
+	if i.ReaderHasBom(r) {
+		_, _ = io.ReadFull(r, make([]byte, i.BomLen())) // skip the BOM bytes of this codepage (not UTF-8's); error deliberately ignored, removal is best effort
+	}
+	return r
+}
+
+// codepageByName - search and return codepage id by name
+func codepageByName(name string) IDCodePage {
+	id, ok := nameMap[strings.ToLower(strings.TrimSpace(name))]
+	if !ok {
+		return ASCII
+	}
+	return id
+}
+
 // matcher - return struct MatchRes - two criterion
-// this function must be realised in each code page
+// this function must be realised in each codepage
 type matcher func(data []byte, tbl *cpTable) MatchRes

 // container - return true if b contain in
--- a/code_pages_id.go
+++ b/code_pages_id.go
@ -87,3 +87,42 @@ const (
 	// Microsoft http://www.iana.org/assignments/charset-reg/windows-1252
 	Windows1252 IDCodePage = 2252
 )
+
+var nameMap = map[string]IDCodePage{
+	"unicode-1-1-utf-8":  UTF8,
+	"utf-8":              UTF8,
+	"utf8":               UTF8,
+	"866":                CP866,
+	"cp-866":             CP866,
+	"cp866":              CP866,
+	"csibm866":           CP866,
+	"ibm866":             CP866,
+	"csisolatincyrillic": ISOLatinCyrillic,
+	"cyrillic":           ISOLatinCyrillic,
+	"iso-8859-5":         ISOLatinCyrillic,
+	"iso-ir-144":         ISOLatinCyrillic,
+	"iso8859-5":          ISOLatinCyrillic,
+	"iso88595":           ISOLatinCyrillic,
+	"iso_8859-5":         ISOLatinCyrillic,
+	"iso_8859-5:1988":    ISOLatinCyrillic,
+	"cskoi8r":            KOI8R,
+	"koi":                KOI8R,
+	"koi8":               KOI8R,
+	"koi8-r":             KOI8R,
+	"koi8_r":             KOI8R,
+	"koi8-ru":            KOI8R,
+	"koi8-u":             KOI8R,
+	"cp1251":             CP1251,
+	"cp-1251":            CP1251,
+	"win1251":            CP1251,
+	"win-1251":           CP1251,
+	"windows-1251":       CP1251,
+	"windows1251":        CP1251,
+	"x-cp1251":           CP1251,
+	"utf-16be":           UTF16BE,
+	"utf16be":            UTF16BE,
+	"utf-16":             UTF16LE,
+	"utf16":              UTF16LE,
+	"utf-16le":           UTF16LE,
+	"utf16le":            UTF16LE,
+}
--- a/cpd.go
+++ b/cpd.go
@ -4,24 +4,27 @@ package cpd

 import (
 	"bufio"
-	"bytes"
+	"errors"
 	"fmt"
 	"io"
 	"os"
 	"unicode"
-	"unicode/utf16"
-	"unicode/utf8"

 	"golang.org/x/text/encoding/charmap"
+	"golang.org/x/text/encoding/htmlindex"
 	"golang.org/x/text/transform"
 )

 // ReadBufSize - byte count for reading from file, func FileCodePageDetect()
 var ReadBufSize int = 1024

-// FileCodepageDetect - detect code page of text file
-func FileCodepageDetect(fn string, stopStr ...string) (IDCodePage, error) {
+// SupportedEncoder - check codepage name
+func SupportedEncoder(cpn string) bool {
+	return codepageByName(cpn) != ASCII
+}

+// FileCodepageDetect - detect codepage of text file
+func FileCodepageDetect(fn string, stopStr ...string) (IDCodePage, error) {
 	iFile, err := os.Open(fn)
 	if err != nil {
 		return ASCII, err
@ -57,18 +60,14 @@ func CodepageAutoDetect(b []byte) IDCodePage {
 // FileConvertCodepage - replace code page text file from one to another
 // support convert only from/to Windows1251/IBM866
 func FileConvertCodepage(fileName string, fromCP, toCP IDCodePage) error {
-	if fromCP == toCP {
+	switch {
+	case (fromCP == toCP):
+		return nil
+	case (fromCP != CP1251) && (fromCP != CP866):
+		return nil
+	case (toCP != CP1251) && (toCP != CP866):
 		return nil
 	}
-
-	if (fromCP != CP1251) && (fromCP != CP866) {
-		return nil
-	}
-
-	if (toCP != CP1251) && (toCP != CP866) {
-		return nil
-	}
-
 	iFile, err := os.Open(fileName)
 	if err != nil {
 		return err
@ -111,44 +110,6 @@ func CodepageAsString(codepage IDCodePage) string {
 	return codepageDic[codepage].name
 }

-// DecodeUTF16le - decode slice of byte from UTF16 to UTF8
-func DecodeUTF16le(s string) string {
-	if len(s) == 0 {
-		return ""
-	}
-	s = UTF16LE.DeleteBom(s)
-	b := []byte(s)
-	u16s := make([]uint16, 1)
-	ret := &bytes.Buffer{}
-	b8buf := make([]byte, 4)
-	for i := 0; i < len(b); i += 2 {
-		u16s[0] = uint16(b[i]) + (uint16(b[i+1]) << 8)
-		r := utf16.Decode(u16s)
-		n := utf8.EncodeRune(b8buf, r[0])
-		ret.Write(b8buf[:n])
-	}
-	return ret.String()
-}
-
-// DecodeUTF16be - decode slice of byte from UTF16 to UTF8
-func DecodeUTF16be(s string) string {
-	if len(s) == 0 {
-		return ""
-	}
-	s = UTF16BE.DeleteBom(s)
-	b := []byte(s)
-	u16s := make([]uint16, 1)
-	ret := &bytes.Buffer{}
-	b8buf := make([]byte, 4)
-	for i := 0; i < len(b); i += 2 {
-		u16s[0] = uint16(b[i+1]) + (uint16(b[i]) << 8)
-		r := utf16.Decode(u16s)
-		n := utf8.EncodeRune(b8buf, r[0])
-		ret.Write(b8buf[:n])
-	}
-	return ret.String()
-}
-
 // StrConvertCodepage - convert string from one code page to another
 // function for future, at now support convert only from/to Windows1251/IBM866
 func StrConvertCodepage(s string, fromCP, toCP IDCodePage) (string, error) {
@ -175,3 +136,75 @@ func StrConvertCodepage(s string, fromCP, toCP IDCodePage) (string, error) {
 	}
 	return s, err
 }
+
+func checkBomExist(r io.Reader) bool {
+	buf, _ := bufio.NewReader(r).Peek(4)
+	_, res := CheckBOM(buf)
+	return res
+}
+
+var (
+	errUnknown                   = errors.New("htmlindex: unknown Encoding")
+	errInputIsNil                = errors.New("cpd: input reader is nil")
+	errUnsupportedCodepage       = errors.New("cpd: codepage not support encode/decode")
+	errUnsupportedOutputCodepage = errors.New("cpd: output codepage not support encode")
+)
+
+// NewReader - conversion to UTF-8: returns a reader that yields the data of r decoded to UTF-8
+// if cpn[0] exists and codepageByName() resolves it to something other than ASCII, it is used as the input codepage, otherwise CodepageDetect() is called
+// on error the input reader r is returned with the error: nil input, UTF-32 family (errUnsupportedCodepage), codepage still ASCII (errUnknown), or the CodepageDetect() error
+// when checkBomExist() finds a BOM, cp.BomLen() bytes are skipped; UTF-8 input is returned without decoding
+func NewReader(r io.Reader, cpn ...string) (io.Reader, error) {
+	if r == nil {
+		return r, errInputIsNil
+	}
+	tmpReader := bufio.NewReader(r)
+	var err error
+	cp := ASCII
+	if len(cpn) > 0 {
+		cp = codepageByName(cpn[0])
+	}
+	if cp == ASCII {
+		cp, err = CodepageDetect(tmpReader)
+	}
+	//TODO look carefully at what CodepageDetect() can return
+	//and handle those cases, for example via func unsupportedCodepageToDecode(cp)
+	switch {
+	case (cp == UTF32) || (cp == UTF32BE) || (cp == UTF32LE):
+		return r, errUnsupportedCodepage
+	case cp == ASCII: // the codepage could not be determined, data in an unknown codepage is returned as is
+		return r, errUnknown
+	case err != nil: // on a read error the input is returned as is; NOTE(review): reached only when cp != ASCII, because the case above is tested first
+		return r, err
+	}
+
+	if checkBomExist(tmpReader) {
+		//the error is not handled: if we got here, these bytes have already been read
+		tmpReader.Read(make([]byte, cp.BomLen())) // read and discard the number of bytes occupied by the BOM of this codepage
+	}
+	if cp == UTF8 {
+		return tmpReader, nil // once the BOM is removed UTF-8 can be returned directly, it needs no conversion
+	}
+	//the error is not handled: htmlindex.Get() returns an error only when the encoding is not found, which is no longer possible here
+	//at this point cp can only hold codepages that are present in htmlindex
+	e, _ := htmlindex.Get(cp.String())
+	r = transform.NewReader(tmpReader, e.NewDecoder())
+	return r, nil
+}
+
+// NewReaderTo - creates a new reader encoding from UTF-8 to the specified codepage
+// returns the input reader and errUnsupportedOutputCodepage when codepageByName() does not resolve cpn (result is ASCII)
+// if the input starts with the UTF-8 BOM, the BOM is deleted
+func NewReaderTo(r io.Reader, cpn string) (io.Reader, error) {
+	cpTo := codepageByName(cpn)
+	if cpTo == ASCII {
+		return r, errUnsupportedOutputCodepage
+	}
+	tmpReader := UTF8.DeleteBomFromReader(bufio.NewReader(r))
+	if cpTo == UTF8 {
+		return tmpReader, nil
+	}
+	e, _ := htmlindex.Get(cpTo.String())
+	r = transform.NewReader(tmpReader, e.NewEncoder())
+	return r, nil
+}
--- a/cpd_test.go
+++ b/cpd_test.go
@ -2,6 +2,8 @@

 import (
 	"fmt"
+	"io"
+	"io/ioutil"
 	"os"
 	fp "path/filepath"
 	"strings"
@ -21,15 +23,15 @@ var dStringHasBom = []tStringHasBom{
 	{0, "", false},
 	{ASCII, "", false},
 	{CP866, "CP866", false},
-	{CP1251, string([]byte{0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), false},
-	{CP1251, string([]byte{0xff, 0xfe, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), false},     // contain UTF16LE bom, false because CP1251 have no bom
-	{UTF8, string([]byte{0xef, 0xbb, 0xbf, 0xD0, 0x94, 0xD0, 0xB5, 0xD0, 0xB4}), true},  // Дед UTF8 with bom
-	{UTF8, string([]byte{0xef, 0xbb, 0xbe, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), false}, // UTF8 without bom
-	{UTF8, string([]byte{0xff, 0xbb, 0xbe, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), false}, // UTF8 without bom
-	{UTF16BE, string([]byte{0xfe, 0xff, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), true},
-	{UTF16LE, string([]byte{0xff, 0xfe, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), true},
-	{UTF32BE, string([]byte{0x00, 0x00, 0xfe, 0xff, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), true},
-	{UTF32LE, string([]byte{0xff, 0xfe, 0x00, 0x00, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), true},
+	{CP1251, "\xD0\xEE\xF1\xF1\xE8\xFF", false},
+	{CP1251, "\xff\xfe\xD0\xEE\xF1\xF1\xE8\xFF", false},   // contain UTF16LE bom, false because CP1251 have no bom
+	{UTF8, "\xef\xbb\xbf\xD0\x94\xD0\xB5\xD0\xB4", true},  // Дед UTF8 with bom
+	{UTF8, "\xef\xbb\xbe\xD0\xEE\xF1\xF1\xE8\xFF", false}, // UTF8 without bom
+	{UTF8, "\xff\xbb\xbe\xD0\xEE\xF1\xF1\xE8\xFF", false}, // UTF8 without bom
+	{UTF16BE, "\xfe\xff\xD0\xEE\xF1\xF1\xE8\xFF", true},
+	{UTF16LE, "\xff\xfe\xD0\xEE\xF1\xF1\xE8\xFF", true},
+	{UTF32BE, "\x00\x00\xfe\xff\xD0\xEE\xF1\xF1\xE8\xFF", true},
+	{UTF32LE, "\xff\xfe\x00\x00\xD0\xEE\xF1\xF1\xE8\xFF", true},
 }

 func TestStringHasBom(t *testing.T) {
@ -142,18 +144,17 @@ func TestFileCodePageDetectM(t *testing.T) {
 	wg.Wait()
 }

-//TestCodePageDetect - тестирование метода CodePageDetect
-// проверки на входные параметры:
-// 1. nil		входящий поток явный nil, параметр останова отсутствует
-// 2. nil, "~"	входящий поток явный nil, параметр останова присутствует
-// 3. входящий поток не инициализированный объект, проверка на передачу пустого интерфейса
+// TestCodePageDetect - тестирование метода CodePageDetect
+// проверки на входные параметры
 // проверка самой работы осуществляется через FileCodePageDetect()
 func TestCodePageDetect(t *testing.T) {
+	// 1. nil		входящий поток явный nil, параметр останова отсутствует
 	tmp, err := CodepageDetect(nil)
 	assert.Nil(t, err, fmt.Sprintf("<CodePageDetect> on input nil return error != nil\n"))
 	assert.Equal(t, tmp, ASCII, fmt.Sprintf("<CodePageDetect> on input nil return code page != ASCII\n"))

 	var data *os.File
+	// 2. nil, "~"	входящий поток явный nil, параметр останова присутствует
 	res, err := CodepageDetect(data)
 	assert.NotNil(t, err, fmt.Sprintf("<CodePageDetect> on empty io.Reader return error != nil, data: %+v, err: %v\n", data, err))
 	assert.Equal(t, res, ASCII, fmt.Sprintf("<CodePageDetect> on empty io.Reader = %+v return code page %s != ASCII\n", data, res))
@ -248,3 +249,147 @@ func TestDecodeUtf16be(t *testing.T) {
 		assert.Equal(t, d.oStr, s, fmt.Sprintf("test #%d not pass", i))
 	}
 }
+
+type tNewReader struct {
+	iStr   string // input string
+	oStr   string // result string
+	cpName string // codepage name
+}
+
+//здесь все тесты без ошибок
+//
+var dNewReader = []tNewReader{
+	{"\xef\xbb\xbf\xD0\x94\xD0\xB5\xD0\xB4", "Дед", ""},                // UTF8 with BOM     0
+	{"\xD0\x94\xD0\xB5\xD0\xB4", "Дед", ""},                            // UTF8 w/o BOM      1
+	{"\xFE\xFF\x04\x14\x04\x35\x04\x34", "Дед", ""},                    // UTF16be with BOM  2
+	{"\x04\x14\x04\x35\x04\x34\x00\x20\x04\x20\x04\x43", "Дед Ру", ""}, // UTF16be w/o BOM   3
+	{"\x04\x14\x04\x35\x04\x34", "Дед", "UTF-16BE"},                    // UTF16be w/o BOM   4
+	{"\xFF\xFE\x14\x04\x35\x04\x34\x04", "Дед", ""},                    // UTF16le with BOM  5
+	{"\x14\x04\x35\x04\x34\x04\x20\x00\x20\x04\x43\x04", "Дед Ру", ""}, // UTF16le w/o BOM   6
+	{"\x14\x04\x35\x04\x34\x04", "Дед", "UTF16le"},                     // UTF16le w/o BOM   7
+	{"\xD0\xEE\xF1\xF1\xE8\xFF", "Россия", ""},                         // CP1251            8
+	{"\xCE\xED\xEE", "Оно", ""},                                        // CP1251            9
+	{"\xCE\xED\xEE", "Оно", "w1251"},                                   // CP1251            10
+	{"\xCE\xED\xEE", "Оно", "win1251"},                                 // CP1251            11
+	{"1", "1", ""},                                                     // ascii string      12
+	{"1", "1", "CP866"},                                                // ascii string      13
+	{"1", "1", "CP/866"},                                               // ascii string      14
+	{"", "", "CP1251"},                                                 // ascii string      15
+	{"", "", "CP"},                                                     // ascii string      16
+	{"", "", ""},                                                       // ascii string      17
+}
+
+func TestNewReader(t *testing.T) {
+	var (
+		r   io.Reader
+		err error
+	)
+	for i, d := range dNewReader {
+		tmp := strings.NewReader(d.iStr)
+		if len(d.cpName) > 0 {
+			r, err = NewReader(tmp, d.cpName)
+		} else {
+			r, err = NewReader(tmp)
+		}
+		assert.Nil(t, err, fmt.Sprintf("test #%d not pass, NewReader() return error", i))
+		b, err := ioutil.ReadAll(r)
+		assert.Nil(t, err, fmt.Sprintf("test #%d not pass, ReadAll() return error", i))
+		assert.Equal(t, d.oStr, string(b), fmt.Sprintf("test #%d not pass", i))
+	}
+	//дополнительно тестирование nil ридера
+	r, err = NewReader(nil)
+	assert.Equal(t, err.Error(), "cpd: input reader is nil", "test NewReader(nil) return err==nil, expect: 'cpd: input reader is nil'")
+	assert.Nil(t, r, "test NewReader(nil) return r!=nil, expect: nil")
+}
+
+var dNewReaderErrors = []tNewReader{
+	{"\x00\x00\x04\x20\x00\x00\x04\x43\x00\x00\x04\x41\x00\x00\x04\x41", "cpd: codepage not support decode", "UTF-32Be"}, // UTF-32be Русс	  0
+	{"\x00\x00\x04\x20\x00\x00\x04\x43\x00\x00\x04\x41\x00\x00\x04\x41", "cpd: codepage not support decode", ""},         // UTF-32be Русс	  1
+	{"\x04\x20\x00\x00\x04\x43\x00\x00\x04\x41\x00\x00\x04\x41\x00\x00", "cpd: codepage not support decode", "UTF32le"},  // UTF-32le Русс	  2
+	{"\x04\x20\x00\x00\x04\x43\x00\x00\x04\x41\x00\x00\x04\x41\x00\x00", "cpd: codepage not support decode", ""},         // UTF-32le Русс	  3
+}
+
+// тестирование случаев с ошибкой
+func TestNewReaderError(t *testing.T) {
+	var (
+		r   io.Reader
+		err error
+	)
+	for i, d := range dNewReaderErrors {
+		tmp := strings.NewReader(d.iStr)
+		if len(d.cpName) > 0 {
+			r, err = NewReader(tmp, d.cpName)
+		} else {
+			r, err = NewReader(tmp)
+		}
+		// мы должны получить ошибку: 'cpd: codepage not support decode'
+		// кроме того вернётся поданый на вход ридер
+		assert.NotNil(t, err, fmt.Sprintf("test #%d not pass, NewReader() return nil error", i))
+		assert.Equal(t, tmp, r, fmt.Sprintf("test #%d not pass", i))
+	}
+}
+
+type tNewReaderTo struct {
+	iStr   string // input string
+	oStr   string // result string
+	cpName string // codepage name
+}
+
+var dNewReaderTo = []tNewReaderTo{
+	{"Дед", "\x84\xA5\xA4", "CP866"},                                                   // 0
+	{"Дед", "\x04\x14\x04\x35\x04\x34", "UTF16be"},                                     // 1
+	{"Дед Ру", "\x04\x14\x04\x35\x04\x34\x00\x20\x04\x20\x04\x43", "UTF-16be"},         // 2
+	{"Дед Ру", "\x14\x04\x35\x04\x34\x04\x20\x00\x20\x04\x43\x04", "UTF16Le"},          // 3
+	{"Дед", "\x14\x04\x35\x04\x34\x04", "UTF16le"},                                     // 4
+	{"Оно", "\xCE\xED\xEE", "CP-1251"},                                                 // 5
+	{"О н о", "\xCE\x20\xED\x20\xEE", "win1251"},                                       // 6
+	{"\xef\xbb\xbf\xD0\x9E\x20\xD0\xBD\x20\xD0\xBe", "\xCE\x20\xED\x20\xEE", "CP1251"}, // 7 input UTF8 with bom, convert to CP1251, BOM is expected to be deleted
+	{"\xef\xbb\xbf\xD0\x94\xD0\xB5\xD0\xB4", "Дед", "UTF8"},                            // 8 input UTF8 with bom, convert to UTF8, BOM is expected to be deleted
+	{"", "", "CP866"}, // empty input string but not empty exist codepage name  11
+}
+
+func TestNewReaderTo(t *testing.T) {
+	for i, d := range dNewReaderTo {
+		r, err := NewReaderTo(strings.NewReader(d.iStr), d.cpName)
+		assert.Nil(t, err, fmt.Sprintf("test #%d not pass, NewReaderTo() return error: '%v'", i, err))
+		b, err := ioutil.ReadAll(r)
+		assert.Nil(t, err, fmt.Sprintf("test #%d not pass, ReadAll() return error: '%v'", i, err))
+		assert.Equal(t, d.oStr, string(b), fmt.Sprintf("test #%d not pass", i))
+	}
+}
+
+// тестирование случаев с ошибкой
+var dNewReaderToError = []tNewReaderTo{
+	{"sample", "cpd: output codepage not support encode", "UTF-32"}, // 0 landing codepage not support encoding
+	{"1234", "cpd: output codepage not support encode", ""},         // 2 empty landing codepage submitted
+	{"Дед Ру", "cpd: output codepage not support encode", "cpMY"},   // 3 not exist landing codepage submitted
+	{"", "cpd: output codepage not support encode", "<CP>"},         // 4 empty input string but not empty landing codepage submitted
+	{"", "cpd: output codepage not support encode", ""},             // 5 empty input string and empty landing codepage submitted
+}
+
+func TestNewReaderToError(t *testing.T) {
+	for i, d := range dNewReaderToError {
+		_, err := NewReaderTo(strings.NewReader(d.iStr), d.cpName)
+		assert.Equal(t, d.oStr, err.Error(), fmt.Sprintf("test #%d. error expect: %s, got: %s", i, d.oStr, err.Error()))
+	}
+}
+
+type tSupportedEncoders struct {
+	cpn string // input codepage name
+	res bool   // result
+}
+
+var dSupportedEncoders = []tSupportedEncoders{
+	{"", false},        // 0
+	{"<>", false},      // 1
+	{"CP866", true},    // 2
+	{"UTF32", false},   // 3
+	{"UTF32le", false}, // 4
+}
+
+func TestSupportedEncoder(t *testing.T) {
+	for i, d := range dSupportedEncoders {
+		r := SupportedEncoder(d.cpn)
+		assert.Equal(t, d.res, r, fmt.Sprintf("test #%d not pass", i))
+	}
+}
--- a/linter.md
+++ b/linter.md
@ -1,13 +1,30 @@
-cpTable.go:33:19: func `(*cpTable).sort` is unused (unused)
-utf8.go:113:6: func `toUTF8` is unused (unused)
-code_pages.go:171:24: func `TCodepagesDic.clear` is unused (unused)
-cpTable.go:27:19: func `(*cpTable).clear` is unused (unused)
-sample\main.go:16:14: Error return value of `FindFilesExt` is not checked (errcheck)
-	FindFilesExt(&fl, ".\\", os.Args[1])
-	            ^
-cpd_test.go:183:9: Error return value of `os.Link` is not checked (errcheck)
+utf8.go:113:6: `toUTF8` is unused (deadcode)
+func toUTF8(s string) string {
+     ^
+cpd_test.go:235:6: `tDecodeUTF16be` is unused (deadcode)
+type tDecodeUTF16be struct {
+     ^
+code_pages.go:60:9: Error return value of `r.Read` is not checked (errcheck)
+		r.Read(make([]byte, UTF8.BomLen())) // считываем в никуда количество байт занимаемых BOM этой кодировки
+		      ^
+cpd.go:183:17: Error return value of `tmpReader.Read` is not checked (errcheck)
+		tmpReader.Read(make([]byte, cp.BomLen())) // считываем в никуда количество байт занимаемых BOM этой кодировки
+		              ^
+cpd_test.go:184:9: Error return value of `os.Link` is not checked (errcheck)
 	os.Link(fp.Join("test_files/866to1251.txt"), fp.Join("test_files/866to1251.tmp"))
 	       ^
-cpd_test.go:70:2: `st` is unused (structcheck)
+sample\detect-all-files\main.go:18:14: Error return value of `FindFilesExt` is not checked (errcheck)
+	FindFilesExt(&fl, ".\\", os.Args[1])
+	            ^
+cpd_test.go:236:2: `iStr` is unused (structcheck)
+	iStr string
+	^
+cpd_test.go:237:2: `oStr` is unused (structcheck)
+	oStr string
+	^
+cpd_test.go:72:2: `st` is unused (structcheck)
 	st string     //stop string, not using now
 	^
+cpTable.go:33:19: func `(*cpTable).sort` is unused (unused)
+code_pages.go:210:24: func `TCodepagesDic.clear` is unused (unused)
+cpTable.go:27:19: func `(*cpTable).clear` is unused (unused)
--- a/sample/cpname/main.go
+++ b/sample/cpname/main.go
@ -0,0 +1,22 @@
+package main
+
+import (
+	"fmt"
+
+	"github.com/softlandia/cpd"
+
+	hi "golang.org/x/text/encoding/htmlindex"
+)
+
+func main() {
+	fmt.Printf("'github.com/softlandia/cpd' => 'golang.org/x/text/encoding/htmlindex'\n")
+	for cp := range cpd.NewCodepageDic() {
+		e, err := hi.Get(cp.String())
+		if err == nil {
+			name, _ := hi.Name(e)
+			fmt.Printf("%s => %s\n", cp, name)
+		} else {
+			fmt.Printf("%s not present in 'htmlindex'\n", cp)
+		}
+	}
+}
--- a/sample/detect-all-files/866&1251&KOI8r&iso8859-5.txt
+++ b/sample/detect-all-files/866&1251&KOI8r&iso8859-5.txt
--- a/sample/detect-all-files/866&1251.txt
+++ b/sample/detect-all-files/866&1251.txt
@ -0,0 +1,8 @@
+   з «® 866 
+~X~
+пример 1251 ТОЧНО ЖУЙ эти булочки
+~A 
+<OK>
+ё Ё
+<OK>
+
--- a/sample/detect-all-files/866-table.txt
+++ b/sample/detect-all-files/866-table.txt
@ -0,0 +1,15 @@
+ÚÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄ¿
+³   <20>à¨¬¥à ¢ à ¬ª¥                                    ³
+³                                                     ³
+³   ÉÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍ»         ³
+³   º                                       º         ³
+³   º                                       º         ³
+³   º                                       º         ³
+³   º                                       º         ³
+³   º                                       º         ³
+³   º                                       º         ³
+³   ÈÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍ¼         ³
+³                                                     ³
+³                                                     ³
+ÀÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÙ
+
--- a/sample/detect-all-files/866to1251.txt
+++ b/sample/detect-all-files/866to1251.txt
@ -0,0 +1 @@
+¯à¨¬¥à ä ©«  ¢ ª®¤¨à®¢ª¥ 866 „‹Ÿ ¯à®¢¥àª¨ ª®¢¥àâ æ¨¨ äãªæ¨¥© <FileConvertCodePage()>
--- a/sample/detect-all-files/IBM866.txt
+++ b/sample/detect-all-files/IBM866.txt
@ -0,0 +1 @@
+<EFBFBD>ãááª¨© ¢ ª®¤¨à®¢ª¥      IBM866
--- a/sample/detect-all-files/ISO8859-5.txt
+++ b/sample/detect-all-files/ISO8859-5.txt
@ -0,0 +1 @@
+Пример русского в кодировке ISO8859-5
--- a/sample/detect-all-files/KOI8-r.txt
+++ b/sample/detect-all-files/KOI8-r.txt
@ -0,0 +1 @@
+Русский в кодировке       KOI8r
--- a/sample/detect-all-files/KOI8-r2.txt
+++ b/sample/detect-all-files/KOI8-r2.txt
@ -0,0 +1 @@
+РУССКИЙ В КОДИРОВКЕ KOI8-r
--- a/sample/detect-all-files/Win1251.txt
+++ b/sample/detect-all-files/Win1251.txt
@ -0,0 +1 @@
+Русский в кодировке Windows1251
--- a/sample/detect-all-files/empty_file.txt
+++ b/sample/detect-all-files/empty_file.txt
--- a/sample/detect-all-files/main.go
+++ b/sample/detect-all-files/main.go
@ -11,6 +11,8 @@ import (
 	"github.com/softlandia/cpd"
 )

+//выводит кодировку всех файлов с указанным расширением
+//пример запуска: >detect-all-files .txt
 func main() {
 	var fl []string
 	FindFilesExt(&fl, ".\\", os.Args[1])
--- a/sample/detect-all-files/noCodePage.txt
+++ b/sample/detect-all-files/noCodePage.txt
@ -0,0 +1 @@
+0
--- a/sample/detect-all-files/rune_encode_error.txt
+++ b/sample/detect-all-files/rune_encode_error.txt
@ -0,0 +1,2 @@
+1234567890
+abcｰ
--- a/sample/detect-all-files/rune_error_1251.txt
+++ b/sample/detect-all-files/rune_error_1251.txt
@ -0,0 +1,2 @@
+1234567890 Привет
+abc°  #x
--- a/sample/detect-all-files/utf16be-wBOM.txt
+++ b/sample/detect-all-files/utf16be-wBOM.txt
--- a/sample/detect-all-files/utf16be-woBOM-no-ru.txt
+++ b/sample/detect-all-files/utf16be-woBOM-no-ru.txt
--- a/sample/detect-all-files/utf16be-woBOM-only-latin.txt
+++ b/sample/detect-all-files/utf16be-woBOM-only-latin.txt
--- a/sample/detect-all-files/utf16be-woBOM-only-ru.txt
+++ b/sample/detect-all-files/utf16be-woBOM-only-ru.txt
--- a/sample/detect-all-files/utf16be-woBOM.txt
+++ b/sample/detect-all-files/utf16be-woBOM.txt
--- a/sample/detect-all-files/utf16be_las.txt
+++ b/sample/detect-all-files/utf16be_las.txt
--- a/sample/detect-all-files/utf16le-wBOM.txt
+++ b/sample/detect-all-files/utf16le-wBOM.txt
--- a/sample/detect-all-files/utf16le-woBOM-no-ru.txt
+++ b/sample/detect-all-files/utf16le-woBOM-no-ru.txt
--- a/sample/detect-all-files/utf16le-woBOM-only-latin.txt
+++ b/sample/detect-all-files/utf16le-woBOM-only-latin.txt
--- a/sample/detect-all-files/utf16le-woBOM-only-ru.txt
+++ b/sample/detect-all-files/utf16le-woBOM-only-ru.txt
--- a/sample/detect-all-files/utf16le-woBOM.txt
+++ b/sample/detect-all-files/utf16le-woBOM.txt
--- a/sample/detect-all-files/utf16le_las.txt
+++ b/sample/detect-all-files/utf16le_las.txt
--- a/sample/detect-all-files/utf32be-ascii-no-ru.txt
+++ b/sample/detect-all-files/utf32be-ascii-no-ru.txt
--- a/sample/detect-all-files/utf32be-wBOM.txt
+++ b/sample/detect-all-files/utf32be-wBOM.txt
--- a/sample/detect-all-files/utf32be-woBOM.txt
+++ b/sample/detect-all-files/utf32be-woBOM.txt
--- a/sample/detect-all-files/utf32le-ascii-no-ru.txt
+++ b/sample/detect-all-files/utf32le-ascii-no-ru.txt
--- a/sample/detect-all-files/utf32le-wBOM.txt
+++ b/sample/detect-all-files/utf32le-wBOM.txt
--- a/sample/detect-all-files/utf32le-woBOM.txt
+++ b/sample/detect-all-files/utf32le-woBOM.txt
--- a/sample/detect-all-files/utf8-wBOM.txt
+++ b/sample/detect-all-files/utf8-wBOM.txt
@ -0,0 +1 @@
+Русский в кодировке        UTF8
--- a/sample/detect-all-files/utf8-woBOM.txt
+++ b/sample/detect-all-files/utf8-woBOM.txt
@ -0,0 +1 @@
+Русский в кодировке        UTF8 ンラ на японском
--- a/sample/detect-all-files/utf8.txt
+++ b/sample/detect-all-files/utf8.txt
@ -0,0 +1 @@
+Utf8 w/o bom Русский
--- a/sample/detect-all-files/win1251_upper.txt
+++ b/sample/detect-all-files/win1251_upper.txt
@ -0,0 +1 @@
+РУССКИЙ В КОДИРОВКЕ 1251
--- a/sample/tohex/main.go
+++ b/sample/tohex/main.go
@ -0,0 +1,39 @@
+package main
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"strings"
+
+	"github.com/softlandia/cpd"
+)
+
+func main() { // tohex: print os.Args[1], re-encoded into the codepage named by os.Args[2], as a double-quoted run of \x<hex> escapes
+	if len(os.Args) == 1 { // no arguments at all: show usage and stop
+		fmt.Printf("using: 'tohex data CPXXX'\n")
+		os.Exit(0)
+	}
+	// Check that the target codepage is supported. NOTE(review): only len(os.Args) == 1 is rejected above, so a call with exactly one argument reaches os.Args[2] and panics — confirm the guard should require three elements.
+	if !cpd.SupportedEncoder(os.Args[2]) {
+		fmt.Printf("landing codepage: '%s' not supported\n", os.Args[2])
+		os.Exit(0)
+	}
+	// Create a reader over the data argument; reading from it yields the data already in the target codepage.
+	r, err := cpd.NewReaderTo(strings.NewReader(os.Args[1]), os.Args[2])
+	if err != nil {
+		fmt.Printf("cpd.NewReaderTo return error: '%v' \n", err)
+		os.Exit(0)
+	}
+	// Read all converted data. NOTE(review): the ReadAll error is discarded, so a read failure is silent and only the bytes read before it are printed.
+	b, _ := ioutil.ReadAll(r)
+	var sb strings.Builder
+	// Build a double-quoted string holding the hexadecimal code of every byte in the target codepage.
+	sb.WriteString("\"")
+	for _, r := range b { // r is a byte here and shadows the reader r declared above
+		sb.WriteString("\\x")
+		sb.WriteString(fmt.Sprintf("%X", r)) // NOTE(review): %X is not zero-padded, so bytes below 0x10 print a single digit (e.g. \xA) — confirm whether %02X was intended
+	}
+	sb.WriteString("\"")
+	fmt.Printf("%s", sb.String())
+}
--- a/test_files/KOI8-r2.txt
+++ b/test_files/KOI8-r2.txt
@ -1 +1 @@
-ПУССКИЙ В КОДИРОВКЕ KOI8-r
+РУССКИЙ В КОДИРОВКЕ KOI8-r
--- a/utf16be.go
+++ b/utf16be.go
@ -1,11 +1,33 @@
 package cpd

 import (
+	"bytes"
 	"encoding/binary"
+	"unicode/utf16"
+	"unicode/utf8"
 )

 //unit for UTF16BE

+// DecodeUTF16be converts s, a string holding UTF-16 big-endian bytes, to a UTF-8 string; an empty s returns "".
+func DecodeUTF16be(s string) string {
+	if len(s) == 0 {
+		return ""
+	}
+	s = UTF16BE.DeleteBom(s) // presumably strips a leading byte-order mark — defined elsewhere, verify
+	b := []byte(s)
+	u16s := make([]uint16, 1) // one-element buffer: code units are decoded one at a time
+	ret := &bytes.Buffer{}
+	b8buf := make([]byte, 4) // scratch for one encoded rune; utf8.EncodeRune writes at most 4 bytes
+	for i := 0; i < len(b); i += 2 { // NOTE(review): b[i+1] below is out of range when len(b) is odd — confirm callers guarantee an even byte count
+		u16s[0] = uint16(b[i+1]) + (uint16(b[i]) << 8) // big-endian: b[i] is the high byte
+		r := utf16.Decode(u16s) // NOTE(review): a surrogate pair is never passed whole, so each half decodes to U+FFFD — confirm only BMP text is expected
+		n := utf8.EncodeRune(b8buf, r[0])
+		ret.Write(b8buf[:n])
+	}
+	return ret.String()
+}
+
 func matchUTF16be(b []byte, tbl *cpTable) MatchRes {
 	n := len(b)/2 - 1
 	if n <= 0 {
--- a/utf16le.go
+++ b/utf16le.go
@ -1,16 +1,37 @@
 package cpd

 import (
+	"bytes"
 	"encoding/binary"
+	"unicode/utf16"
+	"unicode/utf8"
 )

 //unit for UTF16LE
-//проверка на BOM уже выполнена, в принимаемом массиве нет BOM символов

-//русские буквы в UTF16 имеют уникальные номера
-//определять кодировку UTF16 (как LE так и BE) нужно по внутреннему устройству, не по кодам русских букв
+// DecodeUTF16le converts s, a string holding UTF-16 little-endian bytes, to a UTF-8 string; an empty s returns "".
+func DecodeUTF16le(s string) string {
+	if len(s) == 0 {
+		return ""
+	}
+	s = UTF16LE.DeleteBom(s) // presumably strips a leading byte-order mark — defined elsewhere, verify
+	b := []byte(s)
+	u16s := make([]uint16, 1) // one-element buffer: code units are decoded one at a time
+	ret := &bytes.Buffer{}
+	b8buf := make([]byte, 4) // scratch for one encoded rune; utf8.EncodeRune writes at most 4 bytes
+	for i := 0; i < len(b); i += 2 { // NOTE(review): b[i+1] below is out of range when len(b) is odd — confirm callers guarantee an even byte count
+		u16s[0] = uint16(b[i]) + (uint16(b[i+1]) << 8) // little-endian: b[i+1] is the high byte
+		r := utf16.Decode(u16s) // NOTE(review): a surrogate pair is never passed whole, so each half decodes to U+FFFD — confirm only BMP text is expected
+		n := utf8.EncodeRune(b8buf, r[0])
+		ret.Write(b8buf[:n])
+	}
+	return ret.String()
+}

 // matchUTF16le - функция вычисляет общий критерий для кодировки UTF16LE
+// проверка на BOM уже выполнена, в принимаемом массиве нет BOM символов
+// русские буквы в UTF16 имеют уникальные номера
+// определять кодировку UTF16 (как LE так и BE) нужно по внутреннему устройству, не по кодам русских букв
 // два критерия используется
 // первый количество найденных русских букв
 // второй количество найденных 0x00
@ -18,7 +39,7 @@ import (
 func matchUTF16le(b []byte, tbl *cpTable) MatchRes {
 	n := len(b)/2 - 1
 	if n <= 0 {
-		return MatchRes{0, 0}
+		return MatchRes{0, 0} // too short: b holds fewer than two complete 16-bit units
 	}
 	return MatchRes{max(matchUTF16leRu(b, tbl), matchUTF16leZerro(b)), 0}
 }
--- a/utf32be.go
+++ b/utf32be.go
@ -2,7 +2,7 @@ package cpd

 //UTF-32BE

-//первые 2 байта практически всегда меньше вторых 2 байтов
+//первые 2 байта практически всегда равны 0
 func matchUTF32be(d []byte, tbl *cpTable) MatchRes {
 	zerroCounts := 0
 	for i := 0; i < len(d)-4; i += 4 {
				`@ -0,0 +1 @@`
				`¯à¨¬¥à ä ©« ¢ ª®¤¨à®¢ª¥ 866 „‹Ÿ ¯à®¢¥àª¨ ª®¢¥àâ æ¨¨ äãªæ¨¥© <FileConvertCodePage()>`
				`@ -0,0 +1 @@`
				`<EFBFBD>ãááª¨© ¢ ª®¤¨à®¢ª¥ IBM866`
				`@ -0,0 +1 @@`
				`Пример русского в кодировке ISO8859-5`
				`@ -0,0 +1 @@`
				`Русский в кодировке Windows1251`
				`@ -0,0 +1 @@`
				`Русский в кодировке UTF8 ンラ на японском`