v0.2.2 add UTF32 with BOM

2019-11-21 20:53:49 +04:00 · 2019-11-21 20:53:49 +04:00 · fbe3be3cbc
--- a/CheckBom.go
+++ b/CheckBom.go
@ -0,0 +1,46 @@
+package cpd
+
+import "bytes"
+
+// Boms - byte order marks (BOM) and the code page each one identifies; order matters: the UTF-32LE mark starts with the UTF-16LE mark, so it must be tested first
+var Boms = []struct {
+	bom []byte
+	id  IDCodePage
+}{
+	{[]byte{0xef, 0xbb, 0xbf}, UTF8},
+	{[]byte{0x00, 0x00, 0xfe, 0xff}, UTF32BE},
+	{[]byte{0xff, 0xfe, 0x00, 0x00}, UTF32LE},
+	{[]byte{0xfe, 0xff}, UTF16BE},
+	{[]byte{0xff, 0xfe}, UTF16LE},
+}
+
+//CheckBOM - return the code page of the first entry in Boms whose mark is a prefix of buf, and true; return ASCII and false if buf starts with none of them
+func CheckBOM(buf []byte) (id IDCodePage, res bool) {
+	for _, b := range Boms {
+		if bytes.HasPrefix(buf, b.bom) {
+			return b.id, true
+		}
+	}
+	return ASCII, false
+}
+
+func bomUTF8(b []byte) bool {
+	return (len(b) >= 3) && (b[0] == 0xEF) && (b[1] == 0xBB) && (b[2] == 0xBF) // >= 3: a buffer holding only the 3-byte UTF-8 BOM must match too
+}
+
+func bomUTF16le(b []byte) bool {
+	return (len(b) >= 2) && (b[0] == 0xFF) && (b[1] == 0xFE) // >= 2: a buffer holding only the 2-byte UTF-16LE BOM must match too
+}
+
+func bomUTF16be(b []byte) bool {
+	return (len(b) >= 2) && (b[0] == 0xFE) && (b[1] == 0xFF) // >= 2: a buffer holding only the 2-byte UTF-16BE BOM must match too
+}
+
+//itASCII - ASCII stub: ignores r and tbl and always returns 0
+func itASCII(r rune, tbl *codePageTable) int {
+	return 0
+}
+
+func runesMatchASCII(b []byte, tbl *codePageTable) int {
+	return 0 // ASCII stub: ignores b and tbl, the match count is always 0
+}
--- a/README-RU.md
+++ b/README-RU.md
@ -0,0 +1,74 @@
+# code page detect #
+
+(c) softlandia@gmail.com
+
+>download: go get -u github.com/softlandia/cpd  
+>install: go install
+
+библиотека на golang
+
+предназначена для автоматического определения кодовой страницы текстовых файлов или потоков байт  
+поддерживает следующие кодовые страницы:
+
+no ID                Name
+
+1. ASCII:            "ASCII",
+2. ISOLatinCyrillic: "ISO-8859-5",
+3. CP866:            "CP866",
+4. Windows1251:      "Windows-1251",
+5. UTF8:             "UTF-8",
+6. UTF16LE:          "UTF-16LE",
+7. UTF16BE:          "UTF-16BE",
+8. UTF32:            "UTF-32",
+9. KOI8R:            "KOI8-R",
+10. Unicode:          "Unicode",
+11. UTF7:             "UTF-7",
+12. UTF32LE:          "UTF-32LE",
+13. UTF32BE:          "UTF-32BE",
+
+## особенности ##
+
+если данные содержат только латинские символы (первая половина ASCII таблицы) будет определена кодировка UTF-8  
+это не является ошибкой, поскольку такой файл или данные действительно можно использовать как UTF-8
+
+при использовании golang 1.12.6 в проект добавляется код размером ~240 kB
+
+## зависимости ##
+
+>"golang.org/x/text/encoding/charmap"  
+>"golang.org/x/text/transform"
+
+## типы ##
+
+IDCodePage uint16 - индекс кодовой страницы, значения взяты из файла поставки golang golang.org\x\text\encoding\internal\identifier\mib.go
+поддерживается interface String(), и можно выводить так
+    cp := cpd.UTF8
+    fmt.Printf("code page index, name: %d, %s\n", cp, cp)
+    //>code page index, name: 106, UTF-8
+
+## глобальные переменные ##
+
+ReadBufSize int = 1024 // количество байт считываемых из ридера (буфера) для определения кодировки
+
+## функции ##
+
+1. CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error)
+2. FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error)
+
+## описание ##
+
+    CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error)
+      определяет кодовую страницу считывая поток байтов из 'r' 
+      используется 'reflect.ValueOf(r).IsValid()' для проверки 'r' на существование
+      считывает из 'r' первые ReadBufSize байтов
+      параметр stopStr пока не используется
+
+    FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error)
+      определяет кодовую страницу считывая файл 'fn', считывает из файла первые ReadBufSize байтов
+      ошибку возвращает если проблемы с открытием файла 'fn'
+      возвращает cpd.ASCII если кодировка не определена
+
+## tests ##
+
+coverage: 84.0% of statements  
+в папке "test_files" лежат файлы для тестов, соответственно не править и не удалять
--- a/README.md
+++ b/README.md
@ -17,7 +17,8 @@ support russian code page:
 7. UTF-8
 8. ISO8859-5

-### feature ###
+## feature ##
+
 if file contain only latin symbols, this file detected as UTF-8  
 this is not a mistake, this is a completely correct statement

@ -32,6 +33,10 @@ on go vertion 1.12.6 add to exe 240 kB

 IDCodePage uint16 - index of code page, support String() interface, you can fmt.Printf("code page index, name: %d, %s\n", cp, cp) where var cp received from cpd functions

+## variables ##
+
+ReadBufSize int = 1024 // number of bytes read from the input reader by default
+
 ## functions ##

 1. CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error)
@ -43,6 +48,9 @@ IDCodePage uint16 - index of code page, support String() interface, you can fmt.

    CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error)
      detect code page of ascii data from reader 'r' 
+      uses the 'reflect' package to check that the input reader 'r' is valid
+      by default reads only the first 1024 bytes from 'r' (change var ReadBufSize to adjust this)
+      the input parameter stopStr is not used yet

    FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error)
      detect code page of text file "fn", read first 1024 byte (var ReadBufSize to change this setting)
--- a/code_pages.go
+++ b/code_pages.go
@ -17,6 +17,7 @@ func (i IDCodePage) String() string {
 }

 //itRuneMatch - return 1 if rune from this code page, 0 else
+// function exist in every CodePage
 type itRuneMatch func(r rune, tbl *codePageTable) int

 //runesMatch - return count of entry elements of data to code page
@ -32,6 +33,7 @@ type tableElement struct {
 type codePageTable [19]tableElement

 //MatchRes - итоговый критерий совпадения массива данных с кодовой страницей
+// возможно в дальнейшем усложнится
 type MatchRes struct {
 	countMatch int
 }
@ -61,14 +63,95 @@ func (o CodePage) MatchingRunes() string {
 	return sb.String()
 }

+//TCodepagesDic - map of all supported code pages, keyed by code page id
+type TCodepagesDic map[IDCodePage]CodePage
+
+//Match - return the id of the code page with the highest match count for data, or ASCII if every count is 0; the counts are stored back into o; equal counts are resolved by map iteration order, which Go leaves unspecified
+func (o TCodepagesDic) Match(data []byte) (result IDCodePage) {
+	result = ASCII
+	maxCount := 0
+	for id, cp := range o {
+		cp.countMatch = cp.match(data, &cp.table)
+		o[id] = cp
+		if cp.countMatch > maxCount {
+			maxCount = cp.countMatch
+			result = id
+		}
+	}
+	return result
+}
+
+//CodepageDic - the code pages tried by Match, keyed by id; each entry holds id, name, match result, matcher func and the character-code table handed to that matcher
+var CodepageDic = TCodepagesDic{
+	ASCII: {ASCII, "ASCII", MatchRes{0}, runesMatchASCII,
+		codePageTable{{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}},
+
+	CP866: {CP866, "CP866", MatchRes{0}, runesMatch866,
+		codePageTable{
+			//first element serves as sign of absence
+			{0, 0},
+			//о е а и н т с р в
+			{0xAE, 0}, {0xA5, 0}, {0xA0, 0}, {0xA8, 0}, {0xAD, 0}, {0xE2, 0}, {0xE1, 0}, {0xE0, 0}, {0xA2, 0},
+			{0x8E, 0}, {0x85, 0}, {0x80, 0}, {0x88, 0}, {0x8D, 0}, {0x92, 0}, {0x91, 0}, {0x90, 0}, {0x82, 0}}},
+	UTF8: {UTF8, "UTF8", MatchRes{0}, runesMatchUTF8,
+		codePageTable{
+			{0, 0},
+			//о е а и н т с р в (upper-case Н is D0 9D; the former 0xD0AD encodes Э)
+			{0xD0BE, 0}, {0xD0B5, 0}, {0xD0B0, 0}, {0xD0B8, 0}, {0xD0BD, 0}, {0xD182, 0}, {0xD181, 0}, {0xD180, 0}, {0xD0B2, 0},
+			{0xD09E, 0}, {0xD095, 0}, {0xD090, 0}, {0xD098, 0}, {0xD09D, 0}, {0xD0A2, 0}, {0xD0A1, 0}, {0xD0A0, 0}, {0xD092, 0}}},
+	Windows1251: {Windows1251, "Windows1251", MatchRes{0}, runesMatch1251,
+		codePageTable{
+			{0, 0},
+			//а и н с р в л к в  NOTE(review): в (0xE2/0xC2) is listed twice - confirm whether another letter was intended
+			{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xE2, 0},
+			{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xC2, 0}}},
+	KOI8R: {KOI8R, "KOI8R", MatchRes{0}, runesMatchKOI8,
+		codePageTable{
+			//о а и т с в л к м
+			{0, 0},
+			{0xCF, 0}, {0xC1, 0}, {0xC9, 0}, {0xD4, 0}, {0xD3, 0}, {0xD7, 0}, {0xCC, 0}, {0xCB, 0}, {0xCD, 0},
+			{0xEF, 0}, {0xE1, 0}, {0xE9, 0}, {0xF4, 0}, {0xF3, 0}, {0xF7, 0}, {0xEC, 0}, {0xEB, 0}, {0xED, 0}}},
+	ISOLatinCyrillic: {ISOLatinCyrillic, "ISO-8859-5", MatchRes{0}, runesMatchISO88595,
+		codePageTable{
+			//о а и т с в л к е (upper-case О is 0xBE; the former 0xBF encodes П)
+			{0, 0},
+			{0xDE, 0}, {0xD0, 0}, {0xD8, 0}, {0xE2, 0}, {0xE1, 0}, {0xD2, 0}, {0xDB, 0}, {0xDA, 0}, {0xD5, 0},
+			{0xBE, 0}, {0xB0, 0}, {0xB8, 0}, {0xC2, 0}, {0xC1, 0}, {0xB2, 0}, {0xBB, 0}, {0xBA, 0}, {0xB5, 0}}},
+	UTF16LE: {UTF16LE, "UTF16LE", MatchRes{0}, runesMatchUTF16LE,
+		codePageTable{
+			{0, 0},
+			//о е а и н т с р в  NOTE(review): 0x1004 is upper-case А and 0x3004 is lower-case а - the pair looks swapped between the rows, confirm it is harmless
+			{0x3E04, 0}, {0x3504, 0}, {0x1004, 0}, {0x3804, 0}, {0x3D04, 0}, {0x4204, 0}, {0x4104, 0}, {0x4004, 0}, {0x3204, 0},
+			{0x1E04, 0}, {0x1504, 0}, {0x3004, 0}, {0x1804, 0}, {0x1D04, 0}, {0x2204, 0}, {0x2104, 0}, {0x2004, 0}, {0x1204, 0}}},
+	UTF16BE: {UTF16BE, "UTF16BE", MatchRes{0}, runesMatchUTF16BE,
+		codePageTable{
+			{0, 0},
+			//о е а и н т с р в  NOTE(review): 0x0410 is upper-case А and 0x0430 is lower-case а - the pair looks swapped between the rows, confirm it is harmless
+			{0x043E, 0}, {0x0435, 0}, {0x0410, 0}, {0x0438, 0}, {0x043D, 0}, {0x0442, 0}, {0x0441, 0}, {0x0440, 0}, {0x0432, 0},
+			{0x041E, 0}, {0x0415, 0}, {0x0430, 0}, {0x0418, 0}, {0x041D, 0}, {0x0422, 0}, {0x0421, 0}, {0x0420, 0}, {0x0412, 0}}},
+}
+
+//codePageName - string of code page name
+var codePageName = map[IDCodePage]string{
+	ASCII:            "ASCII",
+	ISOLatinCyrillic: "ISO-8859-5",
+	CP866:            "CP866",
+	Windows1251:      "Windows-1251",
+	UTF8:             "UTF-8",
+	UTF16LE:          "UTF-16LE",
+	UTF16BE:          "UTF-16BE",
+	UTF32:            "UTF-32",
+	KOI8R:            "KOI8-R",
+	Unicode:          "Unicode",
+	UTF7:             "UTF-7",
+	UTF32LE:          "UTF-32LE",
+	UTF32BE:          "UTF-32BE",
+}
+
+/*
 //TCodePages - type for store all code page
 type TCodePages []CodePage

-//DeepMach -
-func (o *TCodePages) DeepMach(data []byte) IDCodePage {
-	return ASCII
-}
-
 //Match - return IDCodePage
 //simple calculate count entry data runes in standart code page table
 func (o TCodePages) Match(data []byte) (result IDCodePage) {
@ -84,6 +167,11 @@ func (o TCodePages) Match(data []byte) (result IDCodePage) {
 	return result
 }

+//DeepMach -
+func (o *TCodePages) DeepMach(data []byte) IDCodePage {
+	return ASCII
+}
+
 //CodePages - slice of code pages
 var CodePages = TCodePages{
 	{ASCII, "ASCII", MatchRes{0}, runesMatchASCII,
@ -132,17 +220,4 @@ var CodePages = TCodePages{
 			{0x043E, 0}, {0x0435, 0}, {0x0410, 0}, {0x0438, 0}, {0x043D, 0}, {0x0442, 0}, {0x0441, 0}, {0x0440, 0}, {0x0432, 0},
 			{0x041E, 0}, {0x0415, 0}, {0x0430, 0}, {0x0418, 0}, {0x041D, 0}, {0x0422, 0}, {0x0421, 0}, {0x0420, 0}, {0x0412, 0}}},
 }
-
-//codePageName - string of code page name
-var codePageName = map[IDCodePage]string{
-	ASCII:            "ASCII",
-	ISOLatinCyrillic: "ISO-8859-5",
-	IBM866:           "IBM866",
-	Windows1251:      "Windows1251",
-	UTF8:             "UTF8",
-	UTF16LE:          "UTF16LE",
-	UTF16BE:          "UTF16BE",
-	UTF32:            "UTF32",
-	KOI8R:            "KOI8R",
-	Unicode:          "Unicode",
-}
+*/
--- a/code_pages_id.go
+++ b/code_pages_id.go
@ -72,10 +72,10 @@ const (
 	// Reference: RFC1489
 	KOI8R IDCodePage = 2084

-	// IBM866 is the uint16 identifier with IANA name IBM866.
+	// CP866 is the uint16 identifier with IANA name IBM866.
 	//
 	// IBM NLDG Volume 2 (SE09-8002-03) August 1994
-	IBM866 IDCodePage = 2086
+	CP866 IDCodePage = 2086

 	// Windows1251 is the uint16 identifier with IANA name windows-1251.
 	//
--- a/cp_deep_maching.go
+++ b/cp_deep_maching.go
@ -1,45 +0,0 @@
-package cpd
-
-//checkHeader - check buffer for match to utf-8, utf-16le or utf-16be BOM
-func checkHeader(b []byte) (id IDCodePage, res bool) {
-	if bomUTF8(b) {
-		return UTF8, true
-	}
-	if bomUTF16le(b) {
-		return UTF16LE, true
-	}
-	if bomUTF16be(b) {
-		return UTF16BE, true
-	}
-	return ASCII, false
-}
-
-func bomUTF8(b []byte) bool {
-	if len(b) < 3 {
-		return false
-	}
-	return (b[0] == 0xEF) && (b[1] == 0xBB) && (b[2] == 0xBF)
-}
-
-func bomUTF16le(b []byte) bool {
-	if len(b) < 2 {
-		return false
-	}
-	return (b[0] == 0xFF) && (b[1] == 0xFE)
-}
-
-func bomUTF16be(b []byte) bool {
-	if len(b) < 2 {
-		return false
-	}
-	return (b[0] == 0xFE) && (b[1] == 0xFF)
-}
-
-//ASCII block
-func itASCII(r rune, tbl *codePageTable) int {
-	return 0
-}
-
-func runesMatchASCII(b []byte, tbl *codePageTable) int {
-	return 0
-}
--- a/cpd.go
+++ b/cpd.go
@ -1,7 +1,5 @@
 //Package cpd - code page detect
 // (c) 2019 softlandia@gmail.com
-// v0.1.0
-// 01/oct/2019
 package cpd

 import (
@ -18,9 +16,19 @@ import (
 //ReadBufSize - byte count for reading from file, func FileCodePageDetect()
 var ReadBufSize int = 1024

-//CodePageAutoDetect - auto detect code page of input content
-func CodePageAutoDetect(content []byte) (result IDCodePage) {
-	return CodePages.Match(content)
+//FileCodePageDetect - open file fn and detect its code page with CodePageDetect; only stopStr[0] is forwarded; returns ASCII and the error if the file cannot be opened
+func FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error) {
+
+	iFile, err := os.Open(fn)
+	if err != nil {
+		return ASCII, err
+	}
+	defer iFile.Close()
+
+	if len(stopStr) > 0 {
+		return CodePageDetect(iFile, stopStr[0])
+	}
+	return CodePageDetect(iFile)
 }

 //CodePageDetect - detect code page of ascii data from reader 'r'
@ -37,45 +45,35 @@ func CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error) {
 		return ASCII, err
 	}

-	//check file header // utf-8, utf-16 with BOM
-	if idCodePage, ok := checkHeader(buf); ok {
+	//check whether buf starts with a UTF-8, UTF-16 or UTF-32 BOM
+	if idCodePage, ok := CheckBOM(buf); ok {
 		return idCodePage, nil
 	}

-	//check data for UTF
-	if IsUtf8(buf) {
+	if ValidUTF8(buf) {
 		return UTF8, nil
 	}

 	return CodePageAutoDetect(buf), nil
 }

-//FileCodePageDetect - detect code page of text file
-func FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error) {
-
-	iFile, err := os.Open(fn)
-	if err != nil {
-		return ASCII, err
-	}
-	defer iFile.Close()
-
-	if len(stopStr) > 0 {
-		return CodePageDetect(iFile, stopStr[0])
-	}
-	return CodePageDetect(iFile)
+//CodePageAutoDetect - auto detect code page of input content; the result is whatever CodepageDic.Match returns
+func CodePageAutoDetect(content []byte) (result IDCodePage) {
+	return CodepageDic.Match(content) //TODO most matchers need more than 2 characters, the minimum length should be checked
 }

 //FileConvertCodePage - replace code page text file from one to another
+// supports conversion only between Windows1251 and IBM866
 func FileConvertCodePage(fileName string, fromCP, toCP IDCodePage) error {
 	if fromCP == toCP {
 		return nil
 	}

-	if (fromCP != Windows1251) && (fromCP != IBM866) {
+	if (fromCP != Windows1251) && (fromCP != CP866) {
 		return nil
 	}

-	if (toCP != Windows1251) && (toCP != IBM866) {
+	if (toCP != Windows1251) && (toCP != CP866) {
 		return nil
 	}

@ -101,7 +99,7 @@ func FileConvertCodePage(fileName string, fromCP, toCP IDCodePage) error {
 		if err != nil {
 			oFile.Close()
 			os.Remove(tmpFileName)
-			return fmt.Errorf("cde page convert error on file '%s': %v", fileName, err)
+			return fmt.Errorf("code page convert error on file '%s': %v", fileName, err)
 		}
 		fmt.Fprintf(oFile, "%s\n", s)
 	}
@ -110,7 +108,14 @@ func FileConvertCodePage(fileName string, fromCP, toCP IDCodePage) error {
 	return os.Rename(tmpFileName, fileName)
 }

+//ToUTF8 - placeholder: currently returns s unchanged
+//TODO not implemented yet
+func ToUTF8(s string) string {
+	return s
+}
+
 //StrConvertCodePage - convert string from one code page to another
+// meant to be extended later; for now it supports conversion only between Windows1251 and IBM866
 func StrConvertCodePage(s string, fromCP, toCP IDCodePage) (string, error) {
 	if len(s) == 0 {
 		return "", nil
@ -122,13 +127,13 @@ func StrConvertCodePage(s string, fromCP, toCP IDCodePage) (string, error) {
 	var err error

 	switch fromCP {
-	case IBM866:
+	case CP866:
 		s, _, err = transform.String(charmap.CodePage866.NewDecoder(), s)
 	case Windows1251:
 		s, _, err = transform.String(charmap.Windows1251.NewDecoder(), s)
 	}
 	switch toCP {
-	case IBM866:
+	case CP866:
 		s, _, err = transform.String(charmap.CodePage866.NewEncoder(), s)
 	case Windows1251:
 		s, _, err = transform.String(charmap.Windows1251.NewEncoder(), s)
--- a/cpd_test.go
+++ b/cpd_test.go
@ -13,8 +13,8 @@ type tCodePageAsString struct {
 var dCodePageAsString = []tCodePageAsString{
 	{0, ""},
 	{3, "ASCII"},
-	{IBM866, "IBM866"},
-	{Windows1251, "Windows1251"},
+	{CP866, "CP866"},
+	{Windows1251, "Windows-1251"},
 	{60000, ""},
 }

@ -27,6 +27,61 @@ func TestCodePageAsString(t *testing.T) {
 	}
 }

+type tFileCodePageDetectTest struct {
+	fn string     //filename
+	st string     //stop string (TestFileCodePageDetect does not pass it on)
+	e  error      //expected error
+	r  IDCodePage //expected result
+}
+
+var dFileCodePageDetect = []tFileCodePageDetectTest{
+	{"test_files\\utf32le-wBOM.txt", "", nil, UTF32LE},        //file contain utf32 little endian with bom
+	{"test_files\\KOI8-r.txt", "", nil, KOI8R},                //file contain KOI8
+	{"test_files\\IBM866.txt", "", nil, CP866},                //file contain IBM866
+	{"test_files\\Win1251.txt", "", nil, Windows1251},         //file contain Windows1251
+	{"test_files\\utf8-woBOM.txt", "", nil, UTF8},             //file contain utf8 without bom
+	{"test_files\\866&1251.txt", "", nil, Windows1251},        //file contain more 1251 than 866
+	{"test_files\\noCodePage.txt", "", nil, UTF8},             //file contain rune only ASCII
+	{"test_files\\empty_file.txt", "", nil, UTF8},             //file exists but is empty: no error, detected as UTF8
+	{"test_files\\rune_encode_error.txt", "", nil, ASCII},     //file contain special rune -> encode error, but detect NO error
+	{"test_files\\rune_error_1251.txt", "", nil, Windows1251}, //file contain 1251 and special rune -> encode error, but detect NO error
+	{"test_files\\utf8wbom.txt", "", nil, UTF8},               //file contain utf8 with bom prefix
+	{"test_files\\utf16LEwbom.txt", "", nil, UTF16LE},         //file contain utf16 little endian with BOM
+	{"test_files\\utf16BEwbom.txt", "", nil, UTF16BE},         //file contain utf16 big endian with BOM
+	{"test_files\\utf16le-wBOM.txt", "", nil, UTF16LE},        //file contain utf16 little endian with bom
+	{"test_files\\utf16le-woBOM.txt", "", nil, UTF16LE},       //file contain utf16 little endian without bom
+	{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE},       //file contain utf16 big endian without bom
+	{"test_files\\utf32be-wBOM.txt", "", nil, UTF32BE},        //file contain utf32 big endian with bom
+}
+
+//TestFileCodePageDetect - run FileCodePageDetect over every row of dFileCodePageDetect, then check that non-existent file names return an error
+func TestFileCodePageDetect(t *testing.T) {
+	var (
+		err error
+		res IDCodePage
+	)
+	for _, d := range dFileCodePageDetect {
+		res, err = FileCodePageDetect(d.fn)
+		if err != d.e {
+			t.Errorf("<FileCodePageDetect> on file '%s' expected error:  '%v', got: '%v', ", d.fn, d.e, err)
+		}
+		if res != d.r {
+			t.Errorf("<FileCodePageDetect> on file '%s' expected result: %s, got: %s", d.fn, d.r, res)
+		}
+	}
+
+	_, err = FileCodePageDetect("-.-") //file "-.-" not exist
+	if err == nil {
+		t.Errorf("<FileCodePageDetect> on file '-.-' must return error, but return nil")
+	}
+
+	_, err = FileCodePageDetect("") //file "" not exist
+	if err == nil {
+		t.Errorf("<FileCodePageDetect> on file '' must return error, but return nil")
+	}
+
+}
+
 //TestCodePageDetect - тестирование метода CodePageDetect
 // проверки на входные параметры:
 // 1. nil		входящий поток явный nil, параметр останова отсутствует
@ -55,8 +110,8 @@ func TestFileCodePageDetectSimple(t *testing.T) {
 	if err != nil {
 		t.Errorf("<FileCodePageDetect()> on file '866to1251.txt' err expected: nil, got: %s\n", err)
 	}
-	if res != IBM866 {
-		t.Errorf("<FileCodePageDetect()> on file '866to1251.txt' expected: %s, got: %s\n", IBM866, res)
+	if res != CP866 {
+		t.Errorf("<FileCodePageDetect()> on file '866to1251.txt' expected: %s, got: %s\n", CP866, res)
 	}
 	res, err = FileCodePageDetect("test_files\\866&1251.txt")
 	if err != nil {
@ -77,93 +132,35 @@ func TestFileCodePageDetectUtf8Bom(t *testing.T) {
 	}
 }

-type tFileCodePageDetectTest struct {
-	fn string     //filename
-	st string     //stop string
-	e  error      //
-	r  IDCodePage //expected result
-}
-
-var dFileCodePageDetect = []tFileCodePageDetectTest{
-	{"test_files\\KOI8-r.txt", "", nil, KOI8R},                //file contain KOI8
-	{"test_files\\IBM866.txt", "", nil, IBM866},               //file contain IBM866
-	{"test_files\\Win1251.txt", "", nil, Windows1251},         //file contain Windows1251
-	{"test_files\\utf16BEwbom.txt", "", nil, UTF16BE},         //file contain utf16 big endian with bom rune at start
-	{"test_files\\utf16le-wBOM.txt", "", nil, UTF16LE},        //file contain utf16 liitle endian with bom rune at start
-	{"test_files\\utf8-woBOM.txt", "", nil, UTF8},             //file contain utf8 with out bom rune at start
-	{"test_files\\866&1251.txt", "~X~", nil, Windows1251},     //befor ~X~ file contain 866, after 1251
-	{"test_files\\866&1251.txt", "", nil, Windows1251},        //file contain more 1251 then 866
-	{"test_files\\noCodePage.txt", "", nil, UTF8},             //file contain rune only ASCII
-	{"test_files\\empty_file.txt", "", nil, UTF8},             //file exist but empty, no error, return ASCII
-	{"test_files\\rune_encode_error.txt", "", nil, ASCII},     //file contain special rune -> encode error, but detect NO error
-	{"test_files\\rune_error_1251.txt", "", nil, Windows1251}, //file contain 1251 and special rune -> encode error, but detect NO error
-	{"test_files\\utf8wbom.txt", "", nil, UTF8},               //file contain utf8 with bom rune at start
-	{"test_files\\utf16LEwbom.txt", "", nil, UTF16LE},         //file contain utf16 little endian with bom rune at start
-	{"test_files\\utf16le-woBOM.txt", "", nil, UTF16LE},       //file contain utf16 liitle endian with out bom rune at start
-	{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE},       //file contain utf16 big endian with out bom rune at start
-}
-
-//FileCodePageDetect
-func TestFileCodePageDetect(t *testing.T) {
-	var (
-		err error
-		res IDCodePage
-	)
-	for _, d := range dFileCodePageDetect {
-		if len(d.st) == 0 {
-			res, err = FileCodePageDetect(d.fn)
-		} else {
-			res, err = FileCodePageDetect(d.fn, d.st)
-		}
-		if err != d.e {
-			t.Errorf("<FileCodePageDetect> on file '%s' expected error:  '%v', got: '%v', ", d.fn, d.e, err)
-		}
-		if res != d.r {
-			t.Errorf("<FileCodePageDetect> on file '%s' expected result: %s, got: %s", d.fn, d.r, res)
-		}
-	}
-
-	_, err = FileCodePageDetect("-.-") //file "-.-" not exist
-	if err == nil {
-		t.Errorf("<FileCodePageDetect> on file '-.-' must return error, but return nil")
-	}
-
-	_, err = FileCodePageDetect("") //file "" not exist
-	if err == nil {
-		t.Errorf("<FileCodePageDetect> on file '' must return error, but return nil")
-	}
-
-}
-
 //FileConvertCodePage
 func TestFileConvertCodePage(t *testing.T) {
-	err := FileConvertCodePage("", IBM866, Windows1251)
+	err := FileConvertCodePage("", CP866, Windows1251)
 	if err == nil {
 		t.Errorf("<FileConvertCodePage> on empty file name expected error, got: %v", err)
 	}

-	err = FileConvertCodePage("", IBM866, IBM866)
+	err = FileConvertCodePage("", CP866, CP866)
 	if err != nil {
 		t.Errorf("<FileConvertCodePage> on fromCp == toCp expected error==nil, got: %v", err)
 	}

-	err = FileConvertCodePage("123", UTF8, IBM866)
+	err = FileConvertCodePage("123", UTF8, CP866)
 	if err != nil {
 		t.Errorf("<FileConvertCodePage> on fromCp or toCp not Windows1251 or IBM866 expected error == nil, got: %v", err)
 	}

-	err = FileConvertCodePage("123", IBM866, UTF16LE)
+	err = FileConvertCodePage("123", CP866, UTF16LE)
 	if err != nil {
 		t.Errorf("<FileConvertCodePage> on fromCp or toCp not Windows1251 or IBM866 expected error == nil, got: %v", err)
 	}

-	err = FileConvertCodePage("test_files\\rune_encode_error.txt", IBM866, Windows1251)
+	err = FileConvertCodePage("test_files\\rune_encode_error.txt", CP866, Windows1251)
 	if err == nil {
 		t.Errorf("<FileConvertCodePage> expected error, got: %v", err)
 	}

 	os.Link("test_files\\866to1251.txt", "test_files\\866to1251.tmp")
-	err = FileConvertCodePage("test_files\\866to1251.tmp", IBM866, Windows1251)
+	err = FileConvertCodePage("test_files\\866to1251.tmp", CP866, Windows1251)
 	if err != nil {
 		t.Errorf("<FileConvertCodePage> expect no err, got: %v", err)
 	}
@ -172,19 +169,19 @@ func TestFileConvertCodePage(t *testing.T) {

 //ConvertCodePage
 func TestStrConvertCodePage(t *testing.T) {
-	_, err := StrConvertCodePage("1234", IBM866, Windows1251)
+	_, err := StrConvertCodePage("1234", CP866, Windows1251)
 	if err != nil {
 		t.Errorf("<StrConvertCodePage> on test 1 return unexpected err: %v", err)
 	}
-	_, err = StrConvertCodePage("1234", Windows1251, IBM866)
+	_, err = StrConvertCodePage("1234", Windows1251, CP866)
 	if err != nil {
 		t.Errorf("<StrConvertCodePage> on test 2 return unexpected err: %v", err)
 	}
-	_, err = StrConvertCodePage("", IBM866, Windows1251)
+	_, err = StrConvertCodePage("", CP866, Windows1251)
 	if err != nil {
 		t.Errorf("<StrConvertCodePage> with empty string must return ERROR, but retrurn: %v", err)
 	}
-	_, err = StrConvertCodePage("1234", IBM866, IBM866)
+	_, err = StrConvertCodePage("1234", CP866, CP866)
 	if err != nil {
 		t.Errorf("<StrConvertCodePage> with equal fromCP and toCp must return nil, but retrurn: %v", err)
 	}
--- a/sample/866&1251&KOI8r&iso8859-5.txt
+++ b/sample/866&1251&KOI8r&iso8859-5.txt
@ -0,0 +1,9 @@
+ з «® ў Є®¤Ёа®ўЄҐ 866 Ё Ґс Ўг¤Ґв ¬®Ј®
+~X~
+пример 1251 ТОЧНО ЖУЙ эти булочки
+~A 
+<OK>
+ё Ё
+<OK>
+Б ФЕРЕТШ KOI8 ОЕНОПЗП
+АгббЪШЩ Т ЪЮФШаЮТЪХ   ISO8859-5
--- a/sample/main.go
+++ b/sample/main.go
@ -0,0 +1,16 @@
+package main
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/softlandia/cpd"
+)
+
+func main() {
+	t, _ := cpd.FileCodePageDetect(os.Args[1])
+	fmt.Printf("cpd.FileCodePageDetect():\t%s\n", t)
+	for id, cp := range cpd.CodepageDic {
+		fmt.Printf("%s, %s\n", id, cp.MatchingRunes())
+	}
+}
--- a/sample/sample.exe
+++ b/sample/sample.exe
--- a/test_files/utf32be-wBOM.txt
+++ b/test_files/utf32be-wBOM.txt
--- a/test_files/utf32be-woBOM.txt
+++ b/test_files/utf32be-woBOM.txt
--- a/test_files/utf32le-wBOM.txt
+++ b/test_files/utf32le-wBOM.txt
--- a/test_files/utf32le-woBOM.txt
+++ b/test_files/utf32le-woBOM.txt
--- a/utf8.go
+++ b/utf8.go
@ -51,8 +51,8 @@ func testUTF8bitPattern(b byte) (int, cp int32) {
 	return 0, 0
 }

-//IsUtf8 - return true if imput slice contain true UTF-8
-func IsUtf8(data []byte) bool {
+//ValidUTF8 - return true if imput slice contain true UTF-8
+func ValidUTF8(data []byte) bool {
 	m := len(data)
 	if m <= 1 {
 		return true
--- a/win1251.go
+++ b/win1251.go
@ -4,7 +4,6 @@ import "unicode"

 //unit for windows1251

-//TODO: нужно отличить от KOI-8r
 func runesMatch1251(data []byte, tbl *codePageTable) (counts int) {
 	for i := range data {
 		if i < 2 {