v0.2.3 add UTF32 w/o BOM and ValidUtf8

2019-11-25 19:54:27 +04:00 · 2019-11-25 19:54:27 +04:00 · 45a5c66ef9
--- a/README-RU.md
+++ b/README-RU.md
@ -5,7 +5,7 @@
 >download: go get -u github.com/softlandia/cpd  
 >install: go install

-библиотека на golang
+библиотека для golang

 предназначена для автоматического определения кодовой страницы текстовых файлов или потоков байт  
 поддерживает следующие кодовые страницы:
--- a/char_frac.xlsx
+++ b/char_frac.xlsx
--- a/code_pages.go
+++ b/code_pages.go
@ -66,6 +66,13 @@ func (o CodePage) MatchingRunes() string {
 //TCodepagesDic - type to store all supported code page
 type TCodepagesDic map[IDCodePage]CodePage

+func (o TCodepagesDic) clearMatchCount() {
+	for id, cp := range o {
+		cp.countMatch = 0
+		o[id] = cp
+	}
+}
+
 //Match - return the id of code page to which the data best matches
 func (o TCodepagesDic) Match(data []byte) (result IDCodePage) {
 	result = ASCII
@ -93,7 +100,7 @@ var CodepageDic = TCodepagesDic{
 			//о          е		   а		  и			 н			т			с		  р			в
 			{0xAE, 0}, {0xA5, 0}, {0xA0, 0}, {0xA8, 0}, {0xAD, 0}, {0xE2, 0}, {0xE1, 0}, {0xE0, 0}, {0xA2, 0},
 			{0x8E, 0}, {0x85, 0}, {0x80, 0}, {0x88, 0}, {0x8D, 0}, {0x92, 0}, {0x91, 0}, {0x90, 0}, {0x82, 0}}},
-	UTF8: {UTF8, "UTF8", MatchRes{0}, runesMatchUTF8,
+	UTF8: {UTF8, "UTF-8", MatchRes{0}, runesMatchUTF8,
 		codePageTable{
 			{0, 0},
 			//о           е				а		    и			 н			  т			   с			р			в
@ -102,10 +109,10 @@ var CodepageDic = TCodepagesDic{
 	Windows1251: {Windows1251, "Windows1251", MatchRes{0}, runesMatch1251,
 		codePageTable{
 			{0, 0},
-			//а		    и		   н		  с			 р			в		   л		  к			в
-			{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xE2, 0},
-			{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xC2, 0}}},
-	KOI8R: {KOI8R, "KOI8R", MatchRes{0}, runesMatchKOI8,
+			//а		    и		   н		  с			 р			в		   л		  к			у
+			{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xF3, 0},
+			{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xD3, 0}}},
+	KOI8R: {KOI8R, "KOI8-R", MatchRes{0}, runesMatchKOI8,
 		codePageTable{
 			//о		    а		   и		  т			 с			в		   л		  к			м
 			{0, 0},
@ -117,21 +124,31 @@ var CodepageDic = TCodepagesDic{
 			{0, 0},
 			{0xDE, 0}, {0xD0, 0}, {0xD8, 0}, {0xE2, 0}, {0xE1, 0}, {0xD2, 0}, {0xDB, 0}, {0xDA, 0}, {0xD5, 0},
 			{0xBF, 0}, {0xB0, 0}, {0xB8, 0}, {0xC2, 0}, {0xC1, 0}, {0xB2, 0}, {0xBB, 0}, {0xBA, 0}, {0xB5, 0}}},
-	UTF16LE: {UTF16LE, "UTF16LE", MatchRes{0}, runesMatchUTF16LE,
+	UTF16LE: {UTF16LE, "UTF-16LE", MatchRes{0}, runesMatchUTF16LE,
 		codePageTable{
 			{0, 0},
 			//о           е				а		    и			 н			  т			   с			р			в
 			{0x3E04, 0}, {0x3504, 0}, {0x1004, 0}, {0x3804, 0}, {0x3D04, 0}, {0x4204, 0}, {0x4104, 0}, {0x4004, 0}, {0x3204, 0},
 			{0x1E04, 0}, {0x1504, 0}, {0x3004, 0}, {0x1804, 0}, {0x1D04, 0}, {0x2204, 0}, {0x2104, 0}, {0x2004, 0}, {0x1204, 0}}},
-	UTF16BE: {UTF16BE, "UTF16BE", MatchRes{0}, runesMatchUTF16BE,
+	UTF16BE: {UTF16BE, "UTF-16BE", MatchRes{0}, runesMatchUTF16BE,
 		codePageTable{
 			{0, 0},
 			//о           е				а		    и			 н			  т			   с			р			в
 			{0x043E, 0}, {0x0435, 0}, {0x0410, 0}, {0x0438, 0}, {0x043D, 0}, {0x0442, 0}, {0x0441, 0}, {0x0440, 0}, {0x0432, 0},
 			{0x041E, 0}, {0x0415, 0}, {0x0430, 0}, {0x0418, 0}, {0x041D, 0}, {0x0422, 0}, {0x0421, 0}, {0x0420, 0}, {0x0412, 0}}},
+	UTF32BE: {UTF32BE, "UTF-32BE", MatchRes{0}, runesMatchUTF32BE,
+		codePageTable{
+			{0, 0},
+			{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0},
+			{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}}},
+	UTF32LE: {UTF32LE, "UTF-32LE", MatchRes{0}, runesMatchUTF32LE,
+		codePageTable{
+			{0, 0},
+			{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0},
+			{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}}},
 }

-//codePageName - string of code page name
+//codePageName - string of code page name runesMatchUTF32LE
 var codePageName = map[IDCodePage]string{
 	ASCII:            "ASCII",
 	ISOLatinCyrillic: "ISO-8859-5",
@ -147,77 +164,3 @@ var codePageName = map[IDCodePage]string{
 	UTF32LE:          "UTF-32LE",
 	UTF32BE:          "UTF-32BE",
 }
-
-/*
-//TCodePages - type for store all code page
-type TCodePages []CodePage
-
-//Match - return IDCodePage
-//simple calculate count entry data runes in standart code page table
-func (o TCodePages) Match(data []byte) (result IDCodePage) {
-	result = ASCII
-	maxCount := 0
-	for i, cp := range o {
-		o[i].countMatch = cp.match(data, &o[i].table)
-		if o[i].countMatch > maxCount {
-			maxCount = o[i].countMatch
-			result = cp.id
-		}
-	}
-	return result
-}
-
-//DeepMach -
-func (o *TCodePages) DeepMach(data []byte) IDCodePage {
-	return ASCII
-}
-
-//CodePages - slice of code pages
-var CodePages = TCodePages{
-	{ASCII, "ASCII", MatchRes{0}, runesMatchASCII,
-		codePageTable{{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}},
-	{IBM866, "IBM866", MatchRes{0}, runesMatch866,
-		codePageTable{
-			//first element serves as sign of absence
-			{0, 0},
-			//о          е		   а		  и			 н			т			с		  р			в
-			{0xAE, 0}, {0xA5, 0}, {0xA0, 0}, {0xA8, 0}, {0xAD, 0}, {0xE2, 0}, {0xE1, 0}, {0xE0, 0}, {0xA2, 0},
-			{0x8E, 0}, {0x85, 0}, {0x80, 0}, {0x88, 0}, {0x8D, 0}, {0x92, 0}, {0x91, 0}, {0x90, 0}, {0x82, 0}}},
-	{UTF8, "UTF8", MatchRes{0}, runesMatchUTF8,
-		codePageTable{
-			{0, 0},
-			//о           е				а		    и			 н			  т			   с			р			в
-			{0xD0BE, 0}, {0xD0B5, 0}, {0xD0B0, 0}, {0xD0B8, 0}, {0xD0BD, 0}, {0xD182, 0}, {0xD181, 0}, {0xD180, 0}, {0xD0B2, 0},
-			{0xD09E, 0}, {0xD095, 0}, {0xD090, 0}, {0xD098, 0}, {0xD0AD, 0}, {0xD0A2, 0}, {0xD0A1, 0}, {0xD0A0, 0}, {0xD092, 0}}},
-	{Windows1251, "Windows1251", MatchRes{0}, runesMatch1251,
-		codePageTable{
-			{0, 0},
-			//а		    и		   н		  с			 р			в		   л		  к			в
-			{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xE2, 0},
-			{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xC2, 0}}},
-	{KOI8R, "KOI8R", MatchRes{0}, runesMatchKOI8,
-		codePageTable{
-			//о		    а		   и		  т			 с			в		   л		  к			м
-			{0, 0},
-			{0xCF, 0}, {0xC1, 0}, {0xC9, 0}, {0xD4, 0}, {0xD3, 0}, {0xD7, 0}, {0xCC, 0}, {0xCB, 0}, {0xCD, 0},
-			{0xEF, 0}, {0xE1, 0}, {0xE9, 0}, {0xF4, 0}, {0xF3, 0}, {0xF7, 0}, {0xEC, 0}, {0xEB, 0}, {0xED, 0}}},
-	{ISOLatinCyrillic, "ISO-8859-5", MatchRes{0}, runesMatchISO88595,
-		codePageTable{
-			//о		    а		   и		  т			 с			в		   л		  к			е
-			{0, 0},
-			{0xDE, 0}, {0xD0, 0}, {0xD8, 0}, {0xE2, 0}, {0xE1, 0}, {0xD2, 0}, {0xDB, 0}, {0xDA, 0}, {0xD5, 0},
-			{0xBF, 0}, {0xB0, 0}, {0xB8, 0}, {0xC2, 0}, {0xC1, 0}, {0xB2, 0}, {0xBB, 0}, {0xBA, 0}, {0xB5, 0}}},
-	{UTF16LE, "UTF16LE", MatchRes{0}, runesMatchUTF16LE,
-		codePageTable{
-			{0, 0},
-			//о           е				а		    и			 н			  т			   с			р			в
-			{0x3E04, 0}, {0x3504, 0}, {0x1004, 0}, {0x3804, 0}, {0x3D04, 0}, {0x4204, 0}, {0x4104, 0}, {0x4004, 0}, {0x3204, 0},
-			{0x1E04, 0}, {0x1504, 0}, {0x3004, 0}, {0x1804, 0}, {0x1D04, 0}, {0x2204, 0}, {0x2104, 0}, {0x2004, 0}, {0x1204, 0}}},
-	{UTF16BE, "UTF16BE", MatchRes{0}, runesMatchUTF16BE,
-		codePageTable{
-			{0, 0},
-			//о           е				а		    и			 н			  т			   с			р			в
-			{0x043E, 0}, {0x0435, 0}, {0x0410, 0}, {0x0438, 0}, {0x043D, 0}, {0x0442, 0}, {0x0441, 0}, {0x0440, 0}, {0x0432, 0},
-			{0x041E, 0}, {0x0415, 0}, {0x0430, 0}, {0x0418, 0}, {0x041D, 0}, {0x0422, 0}, {0x0421, 0}, {0x0420, 0}, {0x0412, 0}}},
-}
-*/
--- a/cpd.go
+++ b/cpd.go
@ -45,7 +45,11 @@ func CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error) {
 		return ASCII, err
 	}

-	//is buf contains the BOM of utf-8, utf-16le or utf-16be
+	//возможно определение произойдёт по BOM или по валидности UTF тогда количество попаданий по кодовым страницам лучше занулить
+	//на работу это повлиять не может, но если вывести статистику попаданий, то цифры с предыдущего определения
+	CodepageDic.clearMatchCount()
+
+	//is buf contains the BOM of utf-8, utf-16le, utf-16be, utf-32le or utf-32be
 	if idCodePage, ok := CheckBOM(buf); ok {
 		return idCodePage, nil
 	}
--- a/cpd_test.go
+++ b/cpd_test.go
@ -35,23 +35,28 @@ type tFileCodePageDetectTest struct {
 }

 var dFileCodePageDetect = []tFileCodePageDetectTest{
-	{"test_files\\utf32le-wBOM.txt", "", nil, UTF32LE},        //file contain utf32 little endian with bom
-	{"test_files\\KOI8-r.txt", "", nil, KOI8R},                //file contain KOI8
-	{"test_files\\IBM866.txt", "", nil, CP866},                //file contain IBM866
-	{"test_files\\Win1251.txt", "", nil, Windows1251},         //file contain Windows1251
-	{"test_files\\utf8-woBOM.txt", "", nil, UTF8},             //file contain utf8 with out bom rune at start
 	{"test_files\\866&1251.txt", "", nil, Windows1251},        //file contain more 1251 then 866
-	{"test_files\\noCodePage.txt", "", nil, UTF8},             //file contain rune only ASCII
 	{"test_files\\empty_file.txt", "", nil, UTF8},             //file exist but empty, no error, return ASCII
+	{"test_files\\IBM866.txt", "", nil, CP866},                //file contain IBM866
+	{"test_files\\ISO8859-5.txt", "", nil, ISOLatinCyrillic},  //file contain ISO8859-5
+	{"test_files\\KOI8-r.txt", "", nil, KOI8R},                //file contain KOI8
+	{"test_files\\KOI8-r2.txt", "", nil, KOI8R},               //file contain KOI8
+	{"test_files\\noCodePage.txt", "", nil, UTF8},             //file contain rune only ASCII
 	{"test_files\\rune_encode_error.txt", "", nil, ASCII},     //file contain special rune -> encode error, but detect NO error
 	{"test_files\\rune_error_1251.txt", "", nil, Windows1251}, //file contain 1251 and special rune -> encode error, but detect NO error
-	{"test_files\\utf8wbom.txt", "", nil, UTF8},               //file contain utf8 with bom prefix
-	{"test_files\\utf16LEwbom.txt", "", nil, UTF16LE},         //file contain utf16 little endian with BOM
-	{"test_files\\utf16BEwbom.txt", "", nil, UTF16BE},         //file contain utf16 big endian with BOM
+	{"test_files\\utf8.txt", "", nil, UTF8},                   //file contain utf8 with out bom rune at start
+	{"test_files\\utf8-wbom.txt", "", nil, UTF8},              //file contain utf8 with bom prefix
+	{"test_files\\utf8-woBOM.txt", "", nil, UTF8},             //file contain utf8 with out bom rune at start
+	{"test_files\\utf16be-wBOM.txt", "", nil, UTF16BE},        //file contain utf16 big endian with bom
+	{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE},       //file contain utf16 big endian without bom
 	{"test_files\\utf16le-wBOM.txt", "", nil, UTF16LE},        //file contain utf16 little endian with bom
 	{"test_files\\utf16le-woBOM.txt", "", nil, UTF16LE},       //file contain utf16 little endian without bom
-	{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE},       //file contain utf16 big endian without bom
 	{"test_files\\utf32be-wBOM.txt", "", nil, UTF32BE},        //file contain utf32 big endian with bom
+	{"test_files\\utf32be-woBOM.txt", "", nil, UTF32BE},       //file contain utf32 big endian without bom
+	{"test_files\\utf32le-wBOM.txt", "", nil, UTF32LE},        //file contain utf32 little endian with bom
+	{"test_files\\utf32le-woBOM.txt", "", nil, UTF32LE},       //file contain utf32 little endian without bom
+	{"test_files\\Win1251.txt", "", nil, Windows1251},         //file contain Windows1251
+	{"test_files\\Win1251Test.txt", "", nil, Windows1251},     //file contain Windows1251
 }

 //FileCodePageDetect
@ -123,12 +128,12 @@ func TestFileCodePageDetectSimple(t *testing.T) {
 }

 func TestFileCodePageDetectUtf8Bom(t *testing.T) {
-	res, err := FileCodePageDetect("test_files\\utf8wbom.txt")
+	res, err := FileCodePageDetect("test_files\\utf8-wBOM.txt")
 	if err != nil {
-		t.Errorf("<FileCodePageDetect()> on file 'utf8wbom.txt' err expected: nil, got: %s\n", err)
+		t.Errorf("<FileCodePageDetect()> on file 'utf8-wBOM.txt' err expected: nil, got: %s\n", err)
 	}
 	if res != UTF8 {
-		t.Errorf("<FileCodePageDetect()> on file 'utf8wbom.txt' expected: %s, got: %s\n", UTF8, res)
+		t.Errorf("<FileCodePageDetect()> on file 'utf8-wBOM.txt' expected: %s, got: %s\n", UTF8, res)
 	}
 }

--- a/test_files/KOI8-r2.txt
+++ b/test_files/KOI8-r2.txt
@ -0,0 +1 @@
+ПРИМЕР ТЕКСТА ЗАГЛАВНЫМИ KOI8-r
--- a/test_files/utf16BEwbom.txt
+++ b/test_files/utf16BEwbom.txt
--- a/test_files/utf16LEwbom.txt
+++ b/test_files/utf16LEwbom.txt
--- a/test_files/utf32be-woBOM.txt
+++ b/test_files/utf32be-woBOM.txt
--- a/test_files/utf32le-woBOM.txt
+++ b/test_files/utf32le-woBOM.txt
--- a/test_files/utf8-woBOM.txt
+++ b/test_files/utf8-woBOM.txt
@ -1 +1 @@
-Русский в кодировке        UTF8
+Русский в кодировке        UTF8 ンラ на японском
--- a/test_files/utf8wbom.txt
+++ b/test_files/utf8wbom.txt
@ -1,2 +0,0 @@
-code page UTF8
-Русский
--- a/test_files/win1251Test.txt
+++ b/test_files/win1251Test.txt
@ -0,0 +1 @@
+ПРИМЕР 1251 ТОЧНО ЖУЙ ЭТИ БУЛОЧКИ
--- a/utf32be.go
+++ b/utf32be.go
@ -0,0 +1,21 @@
+package cpd
+
+//UTF-32BE
+
+//первые 2 байта практически всегда меньше вторых 2 байтов
+func runesMatchUTF32BE(d []byte, tbl *codePageTable) (counts int) {
+	var (
+		w1 int64
+		w2 int64
+	)
+	for i := 0; i < len(d)-4; i += 4 {
+		w1 = int64(d[i]) * int64(d[i+1])
+		w2 = int64(d[i+2]) * int64(d[i+3])
+		if w1 > w2 {
+			counts = 0
+			break
+		}
+		counts++
+	}
+	return
+}
--- a/utf32le.go
+++ b/utf32le.go
@ -0,0 +1,22 @@
+package cpd
+
+//UTF-32LE
+
+//первые 2 байта практически всегда больше вторых 2 байтов
+//TODO лучше попробовать оценить количество 0x0 байтов по отношению к общему и если их много, то только тогла определять LE/BE
+func runesMatchUTF32LE(d []byte, tbl *codePageTable) (counts int) {
+	var (
+		w1 int64
+		w2 int64
+	)
+	for i := 0; i < len(d)-4; i += 4 {
+		w1 = int64(d[i]) * int64(d[i+1])
+		w2 = int64(d[i+2]) * int64(d[i+3])
+		if w1 < w2 {
+			counts = 0 //все первые должны быть больше, иначе это не UTF-32le
+			break
+		}
+		counts++
+	}
+	return
+}
--- a/utf8.go
+++ b/utf8.go
@ -4,18 +4,16 @@ import "encoding/binary"

 //unit for UTF8

-func runesMatchUTF8(data []byte, tbl *codePageTable) (counts int) {
-	n := len(data)/2 - 1
-	if n <= 0 {
+func runesMatchUTF8(d []byte, tbl *codePageTable) (counts int) {
+	if len(d) <= 3 {
 		return
 	}
-	for i := 0; i < n; i += 2 {
-		t := data[i : i+2]
+	for i := 0; i < len(d)-3; i++ {
+		t := d[i : i+2]
 		d := binary.BigEndian.Uint16(t)
 		j := tbl.containsRune(rune(d))
 		if j > 0 {
 			(*tbl)[j].count++
-			//counts++
 		}
 		if isUTF8(rune(d)) {
 			counts++
@ -63,16 +61,19 @@ func ValidUTF8(data []byte) bool {
 			zerroByteCount++
 		}
 		n, cp := testUTF8bitPattern(data[i])
+		//n - количество байт следующих за этим которые будут использоваться для отображения данных
+		//n == 0 быть не может, это получается если битовая маска 1100 0000 -> для первого байта UTF-8 это не допустимо
 		if n == 0 {
 			return false
 		}
 		i++
 		var j int32 = 1
-		for ; j < n; j++ {
-			if (data[j] & 0xC0) != 0x80 {
+		for j = 1; j < n; j++ {
+			//байты с данными должны иметь маску 10xx xxxx
+			if (data[i] & 0xC0) != 0x80 {
 				return false
 			}
-			cp = (cp << 6) | int32(data[j]&0x3F)
+			cp = (cp << 6) | int32(data[i]&0x3F)
 			i++
 		}

--- a/win1251.go
+++ b/win1251.go
@ -6,18 +6,9 @@ import "unicode"

 func runesMatch1251(data []byte, tbl *codePageTable) (counts int) {
 	for i := range data {
-		if i < 2 {
+		if i < 1 {
 			continue
 		}
-		//case " Us" - separator_UPPER_symbol
-		if unicode.IsPunct(rune(data[i-2])) && isUpper1251(rune(data[i-1])) {
-			j := tbl.containsRune(rune(data[i]))
-			if j > 0 {
-				(*tbl)[j].count++
-				counts++
-				continue
-			}
-		}
 		//case "ab" - counts only if symbols are arranged in pairs
 		if is1251(rune(data[i-1])) {
 			j := tbl.containsRune(rune(data[i]))
@ -25,6 +16,20 @@ func runesMatch1251(data []byte, tbl *codePageTable) (counts int) {
 				(*tbl)[j].count++
 				counts++
 			}
+			continue
+		}
+		if i < 2 {
+			continue
+		}
+		//case " Us" or ".Us"  separator_UPPER_lower
+		//IsPunct -
+		if (unicode.IsPunct(rune(data[i-2])) || unicode.IsSpace(rune(data[i-2]))) && isUpper1251(rune(data[i-1])) {
+			j := tbl.containsRune(rune(data[i]))
+			if (j > 0) && (isLower1251(rune(data[i]))) {
+				(*tbl)[j].count++
+				counts++
+				continue
+			}
 		}
 	}
 	return
				`@ -0,0 +1 @@`
				`ПРИМЕР ТЕКСТА ЗАГЛАВНЫМИ KOI8-r`
				`@ -0,0 +1 @@`
				`ПРИМЕР 1251 ТОЧНО ЖУЙ ЭТИ БУЛОЧКИ`