v0.2.0

2019-11-07 18:28:00 +04:00 · 2019-11-07 18:28:00 +04:00 · 4387137abc
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
 *.zip
 *.7z
-.idea/*
+.idea/*
+tmp*
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -0,0 +1,17 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Launch",
+            "type": "go",
+            "request": "launch",
+            "mode": "auto",
+            "program": "${fileDirname}",
+            "env": {},
+            "args": []
+        }
+    ]
+}
--- a/char_frac.xlsx
+++ b/char_frac.xlsx
--- a/codePageTable.go
+++ b/codePageTable.go
@ -0,0 +1,23 @@
+package cpd
+
+//codePageTable
+
+//containsRune - looks r up in the code page table
+//returns the index of the first element whose code equals r
+//returns 0 if no element matches
+func (t *codePageTable) containsRune(r rune) int {
+	for idx := 0; idx < len(*t); idx++ {
+		if (*t)[idx].code == r {
+			return idx
+		}
+	}
+	return 0
+}
+
+//isUpper - reports whether r equals the code of any table element
+//stored at index 10 or above
+func (t *codePageTable) isUpper(r rune) bool {
+	for _, el := range (*t)[10:] {
+		if el.code == r {
+			return true
+		}
+	}
+	return false
+}
--- a/code_pages.go
+++ b/code_pages.go
@ -0,0 +1,134 @@
+// Derived from "golang.org/x/text/encoding/internal/identifier" (c) The Go Authors.
+// Contains the identifiers of code pages.
+// IDCodePage implements fmt.Stringer via its String() method.
+
+package cpd
+
+import (
+	"fmt"
+	"strings"
+)
+
+//IDCodePage - index of code page
+type IDCodePage uint16
+
+//String - returns the name stored for i in codePageName,
+//or "" when codePageName has no entry for i
+func (i IDCodePage) String() string {
+	name := codePageName[i]
+	return name
+}
+
+//itRuneMatch - return 1 if rune from this code page, 0 else
+type itRuneMatch func(r rune, tbl *codePageTable) int
+
+//runesMatch - return count of entry elements of data to code page
+type runesMatch func(data []byte, tbl *codePageTable) int
+
+type tableElement struct {
+	code  rune //rune of interest; it is present in this code table as a letter of the alphabet
+	count int  //number of occurrences of this rune
+}
+
+//codePageTable - holds the main (most frequently occurring) alphabet symbols of a code page
+//as filled in by the Cyrillic entries of CodePages: index 0 is a sentinel meaning "absent",
+//indexes 1-9 hold lowercase letters, indexes 10-18 hold uppercase letters
+type codePageTable [19]tableElement
+
+//MatchRes - final criterion of how well a data array matches a code page
+type MatchRes struct {
+	countMatch int
+}
+
+//CodePage - holds the data for one particular code page
+type CodePage struct {
+	id       IDCodePage    //id of code page
+	name     string        //name of code page
+	MatchRes               //count of matching
+	match    runesMatch    //calculates from the input data the count of entries belonging to the code page
+	table    codePageTable //table of the main alphabet runes of this code page, used to calculate frequency
+}
+
+//String - formats the code page as "id: <id>, countMatch: <count>"
+func (o CodePage) String() string {
+	const layout = "id: %s, countMatch: %d"
+	return fmt.Sprintf(layout, o.id, o.MatchRes.countMatch)
+}
+
+//MatchingRunes - returns a string of "code/count, " pairs (code printed in hex)
+//for every table element except the one at index 0
+func (o CodePage) MatchingRunes() string {
+	var out strings.Builder
+	out.WriteString("rune/counts: ")
+	for _, el := range o.table[1:] {
+		fmt.Fprintf(&out, "%x/%d, ", el.code, el.count)
+	}
+	return out.String()
+}
+
+//TCodePages - type for store all code page
+type TCodePages []CodePage
+
+//DeepMach - placeholder: the current body ignores data and always returns ASCII
+func (o *TCodePages) DeepMach(data []byte) IDCodePage {
+	return ASCII
+}
+
+//Match - returns the IDCodePage whose matcher reports the highest count for data
+//every entry's countMatch is overwritten with the count its matcher returned
+//ASCII is returned when no entry reports a count above zero
+//on equal counts the earlier entry wins
+func (o TCodePages) Match(data []byte) (result IDCodePage) {
+	result = ASCII
+	best := 0
+	for i := range o {
+		page := &o[i]
+		page.countMatch = page.match(data, &page.table)
+		if page.countMatch <= best {
+			continue
+		}
+		best = page.countMatch
+		result = page.id
+	}
+	return result
+}
+
+//CodePages - slice of code pages
+//every Cyrillic table starts with a {0, 0} element that serves as the sign of absence,
+//followed by nine lowercase letter codes and then the nine matching uppercase codes
+var CodePages = TCodePages{
+	{ASCII, "ASCII", MatchRes{0}, runesMatchASCII,
+		codePageTable{{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}},
+	{IBM866, "IBM866", MatchRes{0}, runesMatch866,
+		codePageTable{
+			//first element serves as sign of absence
+			{0, 0},
+			//о         е          а          и          н          т          с          р          в
+			{0xAE, 0}, {0xA5, 0}, {0xA0, 0}, {0xA8, 0}, {0xAD, 0}, {0xE2, 0}, {0xE1, 0}, {0xE0, 0}, {0xA2, 0},
+			{0x8E, 0}, {0x85, 0}, {0x80, 0}, {0x88, 0}, {0x8D, 0}, {0x92, 0}, {0x91, 0}, {0x90, 0}, {0x82, 0}}},
+	{UTF8, "UTF8", MatchRes{0}, runesMatchUTF8,
+		codePageTable{
+			{0, 0},
+			//о           е            а            и            н            т            с            р            в
+			{0xD0BE, 0}, {0xD0B5, 0}, {0xD0B0, 0}, {0xD0B8, 0}, {0xD0BD, 0}, {0xD182, 0}, {0xD181, 0}, {0xD180, 0}, {0xD0B2, 0},
+			//uppercase Н is U+041D, encoded as D0 9D (D0 AD would be Э)
+			{0xD09E, 0}, {0xD095, 0}, {0xD090, 0}, {0xD098, 0}, {0xD09D, 0}, {0xD0A2, 0}, {0xD0A1, 0}, {0xD0A0, 0}, {0xD092, 0}}},
+	{Windows1251, "Windows1251", MatchRes{0}, runesMatch1251,
+		codePageTable{
+			{0, 0},
+			//NOTE(review): в is listed twice in each row; containsRune returns the first hit, so the last column is never reached
+			//а         и          н          с          р          в          л          к          в
+			{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xE2, 0},
+			{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xC2, 0}}},
+	{KOI8R, "KOI8R", MatchRes{0}, runesMatchKOI8,
+		codePageTable{
+			{0, 0},
+			//NOTE(review): в is listed twice in each row; containsRune returns the first hit, so the last column is never reached
+			//о         а          и          т          с          в          л          к          в
+			{0xCF, 0}, {0xC1, 0}, {0xC9, 0}, {0xD4, 0}, {0xD3, 0}, {0xD7, 0}, {0xCC, 0}, {0xCB, 0}, {0xD7, 0},
+			{0xEF, 0}, {0xE1, 0}, {0xE9, 0}, {0xF4, 0}, {0xF3, 0}, {0xF7, 0}, {0xEC, 0}, {0xEB, 0}, {0xF7, 0}}},
+}
+
+//codePageName - maps a code page id to its printable name
+//ids without an entry here yield "" on lookup
+var codePageName = map[IDCodePage]string{
+	ASCII:              "ASCII",
+	IBM866:             "IBM866",
+	Windows1251:        "Windows1251",
+	UTF8:               "UTF8",
+	UTF16:              "UTF16",
+	UTF16LE:            "UTF16LE",
+	UTF16BE:            "UTF16BE",
+	UTF32:              "UTF32",
+	KOI8R:              "KOI8R",
+	ISO5427Cyrillic:    "ISO5427Cyrillic",
+	ISO51INISCyrillic:  "ISO51INISCyrillic",
+	ISO111ECMACyrillic: "ISO111ECMACyrillic",
+	ISO153GOST1976874:  "ISO153GOST1976874",
+	Unicode:            "Unicode",
+}
--- a/code_pages_id.go
+++ b/code_pages_id.go
@ -0,0 +1,111 @@
+package cpd
+
+const (
+	// ASCII is the uint16 identifier with IANA name US-ASCII (MIME: US-ASCII).
+	// ANSI X3.4-1986
+	// Reference: RFC2046
+	ASCII IDCodePage = 3
+
+	// ISO5427Cyrillic is the uint16 identifier with IANA name ISO_5427.
+	// ISO-IR: International Register of Escape Sequences
+	// Note: The current registration authority is IPSJ/ITSCJ, Japan.
+	// Reference: RFC1345
+	ISO5427Cyrillic IDCodePage = 48
+
+	// ISO51INISCyrillic is the uint16 identifier with IANA name INIS-cyrillic.
+	// ISO-IR: International Register of Escape Sequences
+	// Note: The current registration authority is IPSJ/ITSCJ, Japan.
+	// Reference: RFC1345
+	ISO51INISCyrillic IDCodePage = 53
+
+	// ISO111ECMACyrillic is the uint16 identifier with IANA name ECMA-cyrillic.
+	// ISO registry
+	// (formerly ECMA registry )
+	ISO111ECMACyrillic IDCodePage = 77
+
+	// ISO153GOST1976874 is the uint16 identifier with IANA name GOST_19768-74.
+	// ISO-IR: International Register of Escape Sequences
+	// Note: The current registration authority is IPSJ/ITSCJ, Japan.
+	// Reference: RFC1345
+	ISO153GOST1976874 IDCodePage = 94
+
+	// UTF8 is the uint16 identifier with IANA name UTF-8.
+	//
+	// rfc3629
+	// Reference: RFC3629
+	UTF8 IDCodePage = 106
+
+	// Unicode is the uint16 identifier with IANA name ISO-10646-UCS-2.
+	//
+	// the 2-octet Basic Multilingual Plane, aka Unicode
+	// this needs to specify network byte order: the standard
+	// does not specify (it is a 16-bit integer space)
+	Unicode IDCodePage = 1000
+
+	// UnicodeASCII is the uint16 identifier with IANA name ISO-10646-UCS-Basic.
+	//
+	// ASCII subset of Unicode.  Basic Latin = collection 1
+	// See ISO 10646, Appendix A
+	UnicodeASCII IDCodePage = 1002
+
+	// UTF7 is the uint16 identifier with IANA name UTF-7.
+	//
+	// rfc2152
+	// Reference: RFC2152
+	UTF7 IDCodePage = 1012
+
+	// UTF16BE is the uint16 identifier with IANA name UTF-16BE.
+	//
+	// rfc2781
+	// Reference: RFC2781
+	UTF16BE IDCodePage = 1013
+
+	// UTF16LE is the uint16 identifier with IANA name UTF-16LE.
+	//
+	// rfc2781
+	// Reference: RFC2781
+	UTF16LE IDCodePage = 1014
+
+	// UTF16 is the uint16 identifier with IANA name UTF-16.
+	//
+	// rfc2781
+	// Reference: RFC2781
+	UTF16 IDCodePage = 1015
+
+	// UTF32 is the uint16 identifier with IANA name UTF-32.
+	//
+	// https://www.unicode.org/unicode/reports/tr19/
+	UTF32 IDCodePage = 1017
+
+	// UTF32BE is the uint16 identifier with IANA name UTF-32BE.
+	//
+	// https://www.unicode.org/unicode/reports/tr19/
+	UTF32BE IDCodePage = 1018
+
+	// UTF32LE is the uint16 identifier with IANA name UTF-32LE.
+	//
+	// https://www.unicode.org/unicode/reports/tr19/
+	UTF32LE IDCodePage = 1019
+
+	// KOI8R is the uint16 identifier with IANA name KOI8-R (MIME: KOI8-R).
+	//
+	// rfc1489 , based on GOST-19768-74, ISO-6937/8,
+	// INIS-Cyrillic, ISO-5427.
+	// Reference: RFC1489
+	KOI8R IDCodePage = 2084
+
+	// IBM866 is the uint16 identifier with IANA name IBM866.
+	//
+	// IBM NLDG Volume 2 (SE09-8002-03) August 1994
+	IBM866 IDCodePage = 2086
+
+	// Windows1251 is the uint16 identifier with IANA name windows-1251.
+	//
+	// Microsoft http://www.iana.org/assignments/charset-reg/windows-1251
+	Windows1251 IDCodePage = 2251
+
+	// Windows1252 is the uint16 identifier with IANA name windows-1252.
+	//
+	// Microsoft http://www.iana.org/assignments/charset-reg/windows-1252
+	Windows1252 IDCodePage = 2252
+)
--- a/const.go
+++ b/const.go
@ -1,15 +0,0 @@
-package cpd
-
-import (
-	"github.com/softlandia/cpd/internal/cp"
-)
-
-//numbers of code page
-const (
-	CpASCII       = cp.ASCII
-	CpWindows1251 = cp.Windows1251
-	CpIBM866      = cp.IBM866
-	CpUTF8        = cp.UTF8
-	CpUTF16       = cp.UTF16
-	CpUTF32       = cp.UTF32
-)
--- a/cp_deep_maching.go
+++ b/cp_deep_maching.go
@ -0,0 +1,45 @@
+package cpd
+
+//checkHeader - inspects the start of b for a byte-order mark
+//returns (UTF8, true), (UTF16LE, true) or (UTF16BE, true) for the first BOM
+//that matches, tested in that order, and (ASCII, false) when none matches
+func checkHeader(b []byte) (id IDCodePage, res bool) {
+	switch {
+	case bomUTF8(b):
+		return UTF8, true
+	case bomUTF16le(b):
+		return UTF16LE, true
+	case bomUTF16be(b):
+		return UTF16BE, true
+	}
+	return ASCII, false
+}
+
+//bomUTF8 - reports whether b starts with the three bytes EF BB BF
+func bomUTF8(b []byte) bool {
+	return len(b) >= 3 && b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF
+}
+
+//bomUTF16le - reports whether b starts with the two bytes FF FE
+func bomUTF16le(b []byte) bool {
+	return len(b) >= 2 && b[0] == 0xFF && b[1] == 0xFE
+}
+
+//bomUTF16be - reports whether b starts with the two bytes FE FF
+func bomUTF16be(b []byte) bool {
+	return len(b) >= 2 && b[0] == 0xFE && b[1] == 0xFF
+}
+
+//ASCII block
+//itASCII - stub: ignores both arguments and always returns 0
+func itASCII(r rune, tbl *codePageTable) int {
+	return 0
+}
+
+//runesMatchASCII - stub matcher: ignores both arguments and always returns 0
+func runesMatchASCII(b []byte, tbl *codePageTable) int {
+	return 0
+}
--- a/cpd.go
+++ b/cpd.go
@ -7,90 +7,57 @@ package cpd
 import (
 	"bufio"
 	"fmt"
+	"io"
 	"os"
-	"strings"
+	"reflect"

-	"github.com/softlandia/cpd/internal/cp"
 	"golang.org/x/text/encoding/charmap"
 	"golang.org/x/text/transform"
 )

-//StrConvertCodePage - convert string from one code page to another
-func StrConvertCodePage(s string, fromCP, toCP uint16) (string, error) {
-	if len(s) == 0 {
-		return "", nil
-	}
-	if fromCP == toCP {
-		return s, nil
-	}
-
-	var err error
-
-	switch fromCP {
-	case cp.IBM866:
-		s, _, err = transform.String(charmap.CodePage866.NewDecoder(), s)
-	case cp.Windows1251:
-		s, _, err = transform.String(charmap.Windows1251.NewDecoder(), s)
-	}
-	switch toCP {
-	case cp.IBM866:
-		s, _, err = transform.String(charmap.CodePage866.NewEncoder(), s)
-	case cp.Windows1251:
-		s, _, err = transform.String(charmap.Windows1251.NewEncoder(), s)
-	}
-	return s, err
+//CodePageAutoDetect - auto detect code page of input content
+//passes content to CodePages.Match and returns its result unchanged
+func CodePageAutoDetect(content []byte) (result IDCodePage) {
+	return CodePages.Match(content)
 }

-// CodePageAsString - return name of char set with id codepage
-// if codepage not exist - return ""
-func CodePageAsString(codepage uint16) string {
-	return cp.Name[codepage]
+//CodePageDetect - detect code page of data from reader 'r'
+//at most the first 1024 bytes of r are examined: a UTF-8 or UTF-16 BOM decides
+//immediately, otherwise the result of CodePageAutoDetect on those bytes is returned
+//stopStr is accepted but is not used by this implementation
+//returns (ASCII, error) when r is a nil interface or when reading fails with
+//an error other than io.EOF
+func CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error) {
+	//a nil interface gives an invalid reflect.Value
+	//NOTE: a typed nil pointer wrapped in the interface passes this check
+	if !reflect.ValueOf(r).IsValid() {
+		return ASCII, fmt.Errorf("input reader is nil")
+	}
+
+	//make slice of byte from input reader
+	buf, err := bufio.NewReader(r).Peek(1024)
+	//io.EOF only means the input is shorter than 1024 bytes, buf holds what was read
+	if err != nil && err != io.EOF {
+		return ASCII, err
+	}
+
+	//check file header // utf-8, utf-16 with BOM
+	if idHeader, ok := checkHeader(buf); ok {
+		return idHeader, nil
+	}
+	return CodePageAutoDetect(buf), nil
 }

-//CodePageDetect - detect code page of file
-//return 0 if code page can not be detected
-//return const cpd.CpWindows1251 for Windows code page 1251
-//return const cdp.CpIBM866 for IBM 866 code page
-//return conts cdp.CpASCII by default or on error
-//EF-BB-BF utf8 bom
-func CodePageDetect(fn string, stopStr ...string) (uint16, error) {
-	var (
-		count1251 int //счётчик символов в кодировке 1251
-		count866  int //счётчик символов в кодировке 866
-	)
+//FileCodePageDetect - detect code page of text file
+func FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error) {

 	iFile, err := os.Open(fn)
 	if err != nil {
-		return CpASCII, err
+		return ASCII, err
 	}
 	defer iFile.Close()

-	iScanner := bufio.NewScanner(iFile)
-	for i := 0; iScanner.Scan(); i++ {
-		s := iScanner.Text()
-		if (len(stopStr) > 0) && strings.Contains(s, stopStr[0]) { //stopStr[0] - строка при обнаружении которой останавливаемся, stopStr - слайс строк
-			break
-		}
-		for j := range s {
-			if isRune1251(rune(s[j])) { //проверка принадлежности символа позициям алфавитных символов в кодовой таблице 1251
-				count1251++
-			}
-			if isRune866(rune(s[j])) { //проверка принадлежности символа позициям алфавитных символов в кодовой таблице 866
-				count866++
-			}
-		}
+	if len(stopStr) > 0 {
+		return CodePageDetect(iFile, stopStr[0])
 	}
-	switch {
-	case count1251 > count866:
-		return CpWindows1251, nil
-	case count1251 < count866:
-		return CpIBM866, nil
-	}
-	return CpASCII, nil
+	return CodePageDetect(iFile)
 }

 //FileConvertCodePage - replace code page text file from one to another
-func FileConvertCodePage(fileName string, fromCP, toCP uint16) error {
+func FileConvertCodePage(fileName string, fromCP, toCP IDCodePage) error {
 	if fromCP == toCP {
 		return nil
 	}
@ -126,37 +93,34 @@ func FileConvertCodePage(fileName string, fromCP, toCP uint16) error {
 	return os.Rename(tmpFileName, fileName)
 }

-const (
-	cp866r1Min  = 0x80 //заглавная буква А
-	cp866r1Max  = 0xAF //строчная буква п - в этом интервале в 866 раскладке лежит большинство русских букв
-	cp866r2Min  = 0xE0 //строчная р
-	cp866r2Max  = 0xF1 //строчна ё - в этом интервале лежат остальные русские буквы
-	cp1251s1    = 0xA8 //Ё
-	cp1251s2    = 0xB8 //ё в этой позиции в 866 лежит псевдографика
-	cp1251r1Min = 0xC0 //с этой позиции начинается весь алфавит
-	cp1251r1Max = 0xFF //заканчивается
-	cpKOI8RMin  = 0xC0 //начало интервала
-	cpKOI8RMax  = 0xFF //конец интервала
-)
-
-func isRune1251(r rune) bool {
-	switch {
-	case r == cp1251s1:
-		return true
-	case r == cp1251s2:
-		return true
-	case (r >= cp1251r1Min) && (r <= cp1251r1Max):
-		return true
+//StrConvertCodePage - convert string from one code page to another
+func StrConvertCodePage(s string, fromCP, toCP IDCodePage) (string, error) {
+	if len(s) == 0 {
+		return "", nil
 	}
-	return false
+	if fromCP == toCP {
+		return s, nil
+	}
+
+	var err error
+
+	switch fromCP {
+	case IBM866:
+		s, _, err = transform.String(charmap.CodePage866.NewDecoder(), s)
+	case Windows1251:
+		s, _, err = transform.String(charmap.Windows1251.NewDecoder(), s)
+	}
+	switch toCP {
+	case IBM866:
+		s, _, err = transform.String(charmap.CodePage866.NewEncoder(), s)
+	case Windows1251:
+		s, _, err = transform.String(charmap.Windows1251.NewEncoder(), s)
+	}
+	return s, err
 }

-func isRune866(r rune) bool {
-	switch {
-	case (r >= cp866r1Min) && (r <= cp866r1Max):
-		return true
-	case (r >= cp866r2Min) && (r <= cp866r2Max):
-		return true
-	}
-	return false
+// CodePageAsString - return name of char set with id codepage
+// if codepage not exist - return ""
+func CodePageAsString(codepage IDCodePage) string {
+	return codePageName[codepage]
 }
--- a/cpd_test.go
+++ b/cpd_test.go
@ -3,20 +3,18 @@
 import (
 	"os"
 	"testing"
-
-	"github.com/softlandia/cpd/internal/cp"
 )

 type tCodePageAsString struct {
-	id uint16
+	id IDCodePage
 	s  string
 }

 var dCodePageAsString = []tCodePageAsString{
 	{0, ""},
 	{3, "ASCII"},
-	{cp.IBM866, "IBM866"},
-	{cp.Windows1251, "Windows1251"},
+	{IBM866, "IBM866"},
+	{Windows1251, "Windows1251"},
 	{60000, ""},
 }

@ -29,44 +27,107 @@ func TestCodePageAsString(t *testing.T) {
 	}
 }

-//CodePageDetect
+//TestCodePageDetect - tests the CodePageDetect function
+// checks of the input parameters:
+// 1. nil		the input stream is an explicit nil, no stop parameter
+// 2. nil, "~"	the input stream is an explicit nil, a stop parameter is present
+// 3. the input stream is an uninitialized object, checks passing an empty interface
+// the detection itself is exercised through FileCodePageDetect()
 func TestCodePageDetect(t *testing.T) {
-	res, err := CodePageDetect("test_files\\866&1251.txt", "~X~") //befor ~X~ file contain 866, after 1251
-	if err != nil {
-		t.Errorf("<CodePageDetect> on file '%s' return error: %v", "866&1251.txt", err)
-	}
-	if res != cp.IBM866 {
-		t.Errorf("<CodePageDetect> on file '%s' expected 866 got: %s", "866&1251.txt", CodePageAsString(res))
-	}
-
-	res, err = CodePageDetect("test_files\\866&1251.txt") //file contain more 1251 then 866
-	if res != cp.Windows1251 {
-		t.Errorf("<CodePageDetect> on file '%s' expected 1251 got: %s", "866&1251.txt", CodePageAsString(res))
-	}
-
-	_, err = CodePageDetect("-.-") //file "-.-" not exist
+	_, err := CodePageDetect(nil)
 	if err == nil {
-		t.Errorf("<CodePageDetect> on file '-.-' must return error, but return nil")
+		t.Errorf("<CodePageDetect> on input nil return error == nil, expect error != nil\n")
+	}
+	_, err = CodePageDetect(nil, "~")
+	if err == nil {
+		t.Errorf("<CodePageDetect> on input nil return error == nil, expect error != nil\n")
 	}

-	res, _ = CodePageDetect("test_files\\noCodePage.txt") //file contain rune only ASCII
-	if res != cp.ASCII {
-		t.Errorf("<CodePageDetect> on file 'noCodePage.txt' expect ASCII got: %s", CodePageAsString(res))
+	var data *os.File
+	res, err := CodePageDetect(data, "~")
+	if err == nil {
+		t.Errorf("<CodePageDetect> on input nil return error != nil, data: %+v, res: %d, code page: %s\n", data, res, CodePageAsString(res))
+	}
+}
+
+func TestFileCodePageDetectSimple(t *testing.T) {
+	res, err := FileCodePageDetect("test_files\\866to1251.txt")
+	if err != nil {
+		t.Errorf("<FileCodePageDetect()> on file '866to1251.txt' err expected: nil, got: %s\n", err)
+	}
+	if res != IBM866 {
+		t.Errorf("<FileCodePageDetect()> on file '866to1251.txt' expected: %s, got: %s\n", IBM866, res)
+	}
+	res, err = FileCodePageDetect("test_files\\866&1251.txt")
+	if err != nil {
+		t.Errorf("<FileCodePageDetect()> on file '866&1251.txt' err expected: nil, got: %s\n", err)
+	}
+	if res != Windows1251 {
+		t.Errorf("<FileCodePageDetect()> on file '866&1251.txt' expected: %s, got: %s\n", Windows1251, res)
+	}
+}
+
+func TestFileCodePageDetectUtf8Bom(t *testing.T) {
+	res, err := FileCodePageDetect("test_files\\utf8wbom.txt")
+	if err != nil {
+		t.Errorf("<FileCodePageDetect()> on file 'utf8wbom.txt' err expected: nil, got: %s\n", err)
+	}
+	if res != UTF8 {
+		t.Errorf("<FileCodePageDetect()> on file 'utf8wbom.txt' expected: %s, got: %s\n", UTF8, res)
+	}
+}
+
+type tFileCodePageDetectTest struct {
+	fn string     //filename
+	st string     //stop string
+	e  error      //
+	r  IDCodePage //expected result
+}
+
+var dFileCodePageDetect = []tFileCodePageDetectTest{
+	{"test_files\\utf16BEwbom.txt", "", nil, UTF16BE},         //file contain utf16 big endian with bom rune at start
+	{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE},       //file contain utf16 big endian with out bom rune at start
+	{"test_files\\utf16le-wBOM.txt", "", nil, UTF16LE},        //file contain utf16 liitle endian with bom rune at start
+	{"test_files\\utf16le-woBOM.txt", "", nil, UTF16LE},       //file contain utf16 liitle endian with out bom rune at start
+	{"test_files\\utf8-woBOM.txt", "", nil, UTF8},             //file contain utf8 with out bom rune at start
+	{"test_files\\866&1251.txt", "~X~", nil, Windows1251},     //befor ~X~ file contain 866, after 1251
+	{"test_files\\866&1251.txt", "", nil, Windows1251},        //file contain more 1251 then 866
+	{"test_files\\noCodePage.txt", "", nil, ASCII},            //file contain rune only ASCII
+	{"test_files\\empty_file.txt", "", nil, ASCII},            //file exist but empty, no error, return ASCII
+	{"test_files\\rune_encode_error.txt", "", nil, ASCII},     //file contain special rune -> encode error, but detect NO error
+	{"test_files\\rune_error_1251.txt", "", nil, Windows1251}, //file contain 1251 and special rune -> encode error, but detect NO error
+	{"test_files\\utf8wbom.txt", "", nil, UTF8},               //file contain utf8 with bom rune at start
+	{"test_files\\utf16LEwbom.txt", "", nil, UTF16LE},         //file contain utf16 little endian with bom rune at start
+}
+
+//FileCodePageDetect
+func TestFileCodePageDetect(t *testing.T) {
+	var (
+		err error
+		res IDCodePage
+	)
+	for _, d := range dFileCodePageDetect {
+		if len(d.st) == 0 {
+			res, err = FileCodePageDetect(d.fn)
+		} else {
+			res, err = FileCodePageDetect(d.fn, d.st)
+		}
+		if err != d.e {
+			t.Errorf("<FileCodePageDetect> on file '%s' expected error:  '%v', got: '%v', ", d.fn, d.e, err)
+		}
+		if res != d.r {
+			t.Errorf("<FileCodePageDetect> on file '%s' expected result: %s, got: %s", d.fn, d.r, res)
+		}
 	}

-	res, err = CodePageDetect("test_files\\empty_file.txt")
-	if (res != cp.ASCII) || (err != nil) {
-		t.Errorf("<CodePageDetect> on file 'empty_file.txt' expect ASCII and no error got: %s and %v", CodePageAsString(res), err)
+	_, err = FileCodePageDetect("-.-") //file "-.-" not exist
+	if err == nil {
+		t.Errorf("<FileCodePageDetect> on file '-.-' must return error, but return nil")
 	}

-	res, err = CodePageDetect("test_files\\rune_encode_error.txt")
-	if (res != cp.ASCII) || (err != nil) {
-		t.Errorf("<CodePageDetect> on file 'rune_encode_error.txt' expect ASCII and no error got: %s and %v", CodePageAsString(res), err)
-	}
-
-	res, err = CodePageDetect("test_files\\rune_error_1251.txt")
-	if res != cp.Windows1251 {
-		t.Errorf("<CodePageDetect> on file 'rune_error_1251.txt' expect 1251 and no error got: %s and %v", CodePageAsString(res), err)
+	_, err = FileCodePageDetect("") //file "" not exist
+	if err == nil {
+		t.Errorf("<FileCodePageDetect> on file '' must return error, but return nil")
 	}

 }
@ -83,13 +144,13 @@ func TestFileConvertCodePage(t *testing.T) {
 		t.Errorf("<FileConvertCodePage> on fromCp == toCp expected error==nil, got: %v", err)
 	}

-	err = FileConvertCodePage("test_files\\rune_encode_error.txt", cp.IBM866, cp.Windows1251)
+	err = FileConvertCodePage("test_files\\rune_encode_error.txt", IBM866, Windows1251)
 	if err == nil {
 		t.Errorf("<FileConvertCodePage> expected error, got: %v", err)
 	}

 	os.Link("test_files\\866to1251.txt", "test_files\\866to1251.tmp")
-	err = FileConvertCodePage("test_files\\866to1251.tmp", cp.IBM866, cp.Windows1251)
+	err = FileConvertCodePage("test_files\\866to1251.tmp", IBM866, Windows1251)
 	if err != nil {
 		t.Errorf("<FileConvertCodePage> expect no err, got: %v", err)
 	}
@ -98,19 +159,19 @@ func TestFileConvertCodePage(t *testing.T) {

 //ConvertCodePage
 func TestStrConvertCodePage(t *testing.T) {
-	_, err := StrConvertCodePage("1234", cp.IBM866, cp.Windows1251)
+	_, err := StrConvertCodePage("1234", IBM866, Windows1251)
 	if err != nil {
 		t.Errorf("<StrConvertCodePage> on test 1 return unexpected err: %v", err)
 	}
-	_, err = StrConvertCodePage("1234", cp.Windows1251, cp.IBM866)
+	_, err = StrConvertCodePage("1234", Windows1251, IBM866)
 	if err != nil {
 		t.Errorf("<StrConvertCodePage> on test 2 return unexpected err: %v", err)
 	}
-	_, err = StrConvertCodePage("", cp.IBM866, cp.Windows1251)
+	_, err = StrConvertCodePage("", IBM866, Windows1251)
 	if err != nil {
 		t.Errorf("<StrConvertCodePage> with empty string must return ERROR, but retrurn: %v", err)
 	}
-	_, err = StrConvertCodePage("1234", cp.IBM866, cp.IBM866)
+	_, err = StrConvertCodePage("1234", IBM866, IBM866)
 	if err != nil {
 		t.Errorf("<StrConvertCodePage> with equal fromCP and toCp must return nil, but retrurn: %v", err)
 	}
--- a/ibm866.go
+++ b/ibm866.go
@ -0,0 +1,19 @@
+package cpd
+
+//unit for ibm866
+
+//runesMatch866 - counts the bytes of data that are found in tbl and directly
+//follow another byte found in tbl
+//each counted byte also increments the count of its table element
+func runesMatch866(data []byte, tbl *codePageTable) (counts int) {
+	for i := 1; i < len(data); i++ {
+		if tbl.containsRune(rune(data[i-1])) == 0 {
+			continue
+		}
+		if j := tbl.containsRune(rune(data[i])); j > 0 {
+			(*tbl)[j].count++
+			counts++
+		}
+	}
+	return counts
+}
--- a/internal/cp/mib.go
+++ b/internal/cp/mib.go
--- a/koi8.go
+++ b/koi8.go
@ -0,0 +1,49 @@
+package cpd
+
+import "unicode"
+
+//unit for koi-8
+
+//runesMatchKOI8 - counts table runes that follow a KOI8 letter
+//starting with the third byte, data[i] is counted once when it is found in tbl and either
+//  - data[i-2] is punctuation and data[i-1] is in the upper-case KOI8 range (case " Us"), or
+//  - data[i-1] is in the KOI8 letter range
+//each counted byte also increments the count of its table element
+func runesMatchKOI8(data []byte, tbl *codePageTable) (counts int) {
+	for i := 2; i < len(data); i++ {
+		prev := rune(data[i-1])
+		afterUpper := unicode.IsPunct(rune(data[i-2])) && isUpperKOI8(prev)
+		if !afterUpper && !isKOI8(prev) {
+			continue
+		}
+		if j := tbl.containsRune(rune(data[i])); j > 0 {
+			(*tbl)[j].count++
+			counts++
+		}
+	}
+	return counts
+}
+
+//byte ranges this file treats as KOI8 upper-case and lower-case letters
+const (
+	cpKOI8StartUpperChar = 0xE0
+	cpKOI8StopUpperChar  = 0xFF
+	cpKOI8StartLowerChar = 0xC0
+	cpKOI8StopLowerChar  = 0xDF
+)
+
+//isUpperKOI8 - true when r lies in [cpKOI8StartUpperChar, cpKOI8StopUpperChar]
+func isUpperKOI8(r rune) bool {
+	return !(r < cpKOI8StartUpperChar || r > cpKOI8StopUpperChar)
+}
+
+//isLowerKOI8 - true when r lies in [cpKOI8StartLowerChar, cpKOI8StopLowerChar]
+func isLowerKOI8(r rune) bool {
+	return !(r < cpKOI8StartLowerChar || r > cpKOI8StopLowerChar)
+}
+
+//isKOI8 - true when r lies in either of the two ranges
+func isKOI8(r rune) bool {
+	if isUpperKOI8(r) {
+		return true
+	}
+	return isLowerKOI8(r)
+}
--- a/test_files/IBM866.txt
+++ b/test_files/IBM866.txt
@ -0,0 +1 @@
+<EFBFBD>ãááª¨© ¢ ª®¤¨à®¢ª¥      IBM866
--- a/test_files/KOI8-r.txt
+++ b/test_files/KOI8-r.txt
@ -0,0 +1 @@
+Русский в кодировке       KOI8r
--- a/test_files/Win1251.txt
+++ b/test_files/Win1251.txt
@ -0,0 +1 @@
+Русский в кодировке Windows1251
--- a/test_files/noCodePage.txt
+++ b/test_files/noCodePage.txt
@ -1 +1 @@
-<EFBFBD>
+0
--- a/test_files/utf16BEwbom.txt
+++ b/test_files/utf16BEwbom.txt
--- a/test_files/utf16LEwbom.txt
+++ b/test_files/utf16LEwbom.txt
--- a/test_files/utf16be-wBOM.txt
+++ b/test_files/utf16be-wBOM.txt
--- a/test_files/utf16be-woBOM.txt
+++ b/test_files/utf16be-woBOM.txt
--- a/test_files/utf16le-wBOM.txt
+++ b/test_files/utf16le-wBOM.txt
--- a/test_files/utf16le-woBOM.txt
+++ b/test_files/utf16le-woBOM.txt
--- a/test_files/utf8-wBOM.txt
+++ b/test_files/utf8-wBOM.txt
@ -0,0 +1 @@
+Русский в кодировке        UTF8
--- a/test_files/utf8-woBOM.txt
+++ b/test_files/utf8-woBOM.txt
@ -0,0 +1 @@
+Русский в кодировке        UTF8
--- a/test_files/utf8.txt
+++ b/test_files/utf8.txt
@ -0,0 +1 @@
+Utf8 w/o bom Русский
--- a/test_files/utf8wbom.txt
+++ b/test_files/utf8wbom.txt
@ -0,0 +1,2 @@
+code page UTF8
+Русский
--- a/utf8.go
+++ b/utf8.go
@ -0,0 +1,22 @@
+package cpd
+
+import "encoding/binary"
+
+//unit for UTF8
+
+//runesMatchUTF8 - counts occurrences of the table's two-byte sequences in data
+//every byte offset of the whole buffer is tried as the start of a big-endian
+//16-bit value, so single-byte characters in front of a letter do not break the
+//pairing; after a hit both bytes are consumed
+//each hit also increments the count of its table element
+//inputs shorter than two bytes yield 0
+func runesMatchUTF8(data []byte, tbl *codePageTable) (counts int) {
+	for i := 0; i+1 < len(data); i++ {
+		d := binary.BigEndian.Uint16(data[i : i+2])
+		j := tbl.containsRune(rune(d))
+		if j > 0 {
+			(*tbl)[j].count++
+			counts++
+			i++ //skip the second byte of the matched pair
+		}
+	}
+	return counts
+}
--- a/win1251.go
+++ b/win1251.go
@ -0,0 +1,51 @@
+package cpd
+
+import "unicode"
+
+//unit for windows1251
+
+//TODO: needs to be told apart from KOI-8r
+//runesMatch1251 - counts table runes that follow a Windows-1251 letter
+//starting with the third byte, data[i] is counted once when it is found in tbl and either
+//  - data[i-2] is punctuation and data[i-1] is in the upper-case 1251 range (case " Us"), or
+//  - data[i-1] is in the 1251 letter range (case "ab", symbols arranged in pairs)
+//each counted byte also increments the count of its table element
+func runesMatch1251(data []byte, tbl *codePageTable) (counts int) {
+	for i := 2; i < len(data); i++ {
+		prev := rune(data[i-1])
+		afterUpper := unicode.IsPunct(rune(data[i-2])) && isUpper1251(prev)
+		if !afterUpper && !is1251(prev) {
+			continue
+		}
+		if j := tbl.containsRune(rune(data[i])); j > 0 {
+			(*tbl)[j].count++
+			counts++
+		}
+	}
+	return counts
+}
+
+//byte ranges this file treats as Windows-1251 upper-case and lower-case letters
+const (
+	cp1251StartUpperChar = 0xC0
+	cp1251StopUpperChar  = 0xDF
+	cp1251StartLowerChar = 0xE0
+	cp1251StopLowerChar  = 0xFF
+)
+
+//isUpper1251 - true when r lies in [cp1251StartUpperChar, cp1251StopUpperChar]
+func isUpper1251(r rune) bool {
+	return !(r < cp1251StartUpperChar || r > cp1251StopUpperChar)
+}
+
+//isLower1251 - true when r lies in [cp1251StartLowerChar, cp1251StopLowerChar]
+func isLower1251(r rune) bool {
+	return !(r < cp1251StartLowerChar || r > cp1251StopLowerChar)
+}
+
+//is1251 - true when r lies in either of the two ranges
+func is1251(r rune) bool {
+	if isUpper1251(r) {
+		return true
+	}
+	return isLower1251(r)
+}
--- a/букв.xlsx
+++ b/букв.xlsx
				`@ -0,0 +1 @@`
				`<EFBFBD>ãááª¨© ¢ ª®¤¨à®¢ª¥ IBM866`
				`@ -0,0 +1 @@`
				`Русский в кодировке Windows1251`