зеркало из https://github.com/softlandia/cpd.git
v0.2.3 add UTF32 w/o BOM and ValidUtf8
This commit is contained in:
Родитель
fbe3be3cbc
Коммит
45a5c66ef9
|
@ -5,7 +5,7 @@
|
|||
>download: go get -u github.com/softlandia/cpd
|
||||
>install: go install
|
||||
|
||||
библиотека на golang
|
||||
библиотека для golang
|
||||
|
||||
предназначена для автоматического определения кодовой страницы текстовых файлов или потоков байт
|
||||
поддерживает следующие кодовые страницы:
|
||||
|
|
Двоичные данные
char_frac.xlsx
Двоичные данные
char_frac.xlsx
Двоичный файл не отображается.
107
code_pages.go
107
code_pages.go
|
@ -66,6 +66,13 @@ func (o CodePage) MatchingRunes() string {
|
|||
//TCodepagesDic - type to store all supported code page
|
||||
type TCodepagesDic map[IDCodePage]CodePage
|
||||
|
||||
func (o TCodepagesDic) clearMatchCount() {
|
||||
for id, cp := range o {
|
||||
cp.countMatch = 0
|
||||
o[id] = cp
|
||||
}
|
||||
}
|
||||
|
||||
//Match - return the id of code page to which the data best matches
|
||||
func (o TCodepagesDic) Match(data []byte) (result IDCodePage) {
|
||||
result = ASCII
|
||||
|
@ -93,7 +100,7 @@ var CodepageDic = TCodepagesDic{
|
|||
//о е а и н т с р в
|
||||
{0xAE, 0}, {0xA5, 0}, {0xA0, 0}, {0xA8, 0}, {0xAD, 0}, {0xE2, 0}, {0xE1, 0}, {0xE0, 0}, {0xA2, 0},
|
||||
{0x8E, 0}, {0x85, 0}, {0x80, 0}, {0x88, 0}, {0x8D, 0}, {0x92, 0}, {0x91, 0}, {0x90, 0}, {0x82, 0}}},
|
||||
UTF8: {UTF8, "UTF8", MatchRes{0}, runesMatchUTF8,
|
||||
UTF8: {UTF8, "UTF-8", MatchRes{0}, runesMatchUTF8,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
|
@ -102,10 +109,10 @@ var CodepageDic = TCodepagesDic{
|
|||
Windows1251: {Windows1251, "Windows1251", MatchRes{0}, runesMatch1251,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//а и н с р в л к в
|
||||
{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xE2, 0},
|
||||
{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xC2, 0}}},
|
||||
KOI8R: {KOI8R, "KOI8R", MatchRes{0}, runesMatchKOI8,
|
||||
//а и н с р в л к у
|
||||
{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xF3, 0},
|
||||
{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xD3, 0}}},
|
||||
KOI8R: {KOI8R, "KOI8-R", MatchRes{0}, runesMatchKOI8,
|
||||
codePageTable{
|
||||
//о а и т с в л к м
|
||||
{0, 0},
|
||||
|
@ -117,21 +124,31 @@ var CodepageDic = TCodepagesDic{
|
|||
{0, 0},
|
||||
{0xDE, 0}, {0xD0, 0}, {0xD8, 0}, {0xE2, 0}, {0xE1, 0}, {0xD2, 0}, {0xDB, 0}, {0xDA, 0}, {0xD5, 0},
|
||||
{0xBF, 0}, {0xB0, 0}, {0xB8, 0}, {0xC2, 0}, {0xC1, 0}, {0xB2, 0}, {0xBB, 0}, {0xBA, 0}, {0xB5, 0}}},
|
||||
UTF16LE: {UTF16LE, "UTF16LE", MatchRes{0}, runesMatchUTF16LE,
|
||||
UTF16LE: {UTF16LE, "UTF-16LE", MatchRes{0}, runesMatchUTF16LE,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0x3E04, 0}, {0x3504, 0}, {0x1004, 0}, {0x3804, 0}, {0x3D04, 0}, {0x4204, 0}, {0x4104, 0}, {0x4004, 0}, {0x3204, 0},
|
||||
{0x1E04, 0}, {0x1504, 0}, {0x3004, 0}, {0x1804, 0}, {0x1D04, 0}, {0x2204, 0}, {0x2104, 0}, {0x2004, 0}, {0x1204, 0}}},
|
||||
UTF16BE: {UTF16BE, "UTF16BE", MatchRes{0}, runesMatchUTF16BE,
|
||||
UTF16BE: {UTF16BE, "UTF-16BE", MatchRes{0}, runesMatchUTF16BE,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0x043E, 0}, {0x0435, 0}, {0x0410, 0}, {0x0438, 0}, {0x043D, 0}, {0x0442, 0}, {0x0441, 0}, {0x0440, 0}, {0x0432, 0},
|
||||
{0x041E, 0}, {0x0415, 0}, {0x0430, 0}, {0x0418, 0}, {0x041D, 0}, {0x0422, 0}, {0x0421, 0}, {0x0420, 0}, {0x0412, 0}}},
|
||||
UTF32BE: {UTF32BE, "UTF-32BE", MatchRes{0}, runesMatchUTF32BE,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0},
|
||||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}}},
|
||||
UTF32LE: {UTF32LE, "UTF-32LE", MatchRes{0}, runesMatchUTF32LE,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0},
|
||||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}}},
|
||||
}
|
||||
|
||||
//codePageName - string of code page name
|
||||
//codePageName - map from code page id to the string name of that code page
|
||||
var codePageName = map[IDCodePage]string{
|
||||
ASCII: "ASCII",
|
||||
ISOLatinCyrillic: "ISO-8859-5",
|
||||
|
@ -147,77 +164,3 @@ var codePageName = map[IDCodePage]string{
|
|||
UTF32LE: "UTF-32LE",
|
||||
UTF32BE: "UTF-32BE",
|
||||
}
|
||||
|
||||
/*
|
||||
//TCodePages - type for store all code page
|
||||
type TCodePages []CodePage
|
||||
|
||||
//Match - return IDCodePage
|
||||
//simple calculate count entry data runes in standart code page table
|
||||
func (o TCodePages) Match(data []byte) (result IDCodePage) {
|
||||
result = ASCII
|
||||
maxCount := 0
|
||||
for i, cp := range o {
|
||||
o[i].countMatch = cp.match(data, &o[i].table)
|
||||
if o[i].countMatch > maxCount {
|
||||
maxCount = o[i].countMatch
|
||||
result = cp.id
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
//DeepMach -
|
||||
func (o *TCodePages) DeepMach(data []byte) IDCodePage {
|
||||
return ASCII
|
||||
}
|
||||
|
||||
//CodePages - slice of code pages
|
||||
var CodePages = TCodePages{
|
||||
{ASCII, "ASCII", MatchRes{0}, runesMatchASCII,
|
||||
codePageTable{{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}},
|
||||
{IBM866, "IBM866", MatchRes{0}, runesMatch866,
|
||||
codePageTable{
|
||||
//first element serves as sign of absence
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0xAE, 0}, {0xA5, 0}, {0xA0, 0}, {0xA8, 0}, {0xAD, 0}, {0xE2, 0}, {0xE1, 0}, {0xE0, 0}, {0xA2, 0},
|
||||
{0x8E, 0}, {0x85, 0}, {0x80, 0}, {0x88, 0}, {0x8D, 0}, {0x92, 0}, {0x91, 0}, {0x90, 0}, {0x82, 0}}},
|
||||
{UTF8, "UTF8", MatchRes{0}, runesMatchUTF8,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0xD0BE, 0}, {0xD0B5, 0}, {0xD0B0, 0}, {0xD0B8, 0}, {0xD0BD, 0}, {0xD182, 0}, {0xD181, 0}, {0xD180, 0}, {0xD0B2, 0},
|
||||
{0xD09E, 0}, {0xD095, 0}, {0xD090, 0}, {0xD098, 0}, {0xD0AD, 0}, {0xD0A2, 0}, {0xD0A1, 0}, {0xD0A0, 0}, {0xD092, 0}}},
|
||||
{Windows1251, "Windows1251", MatchRes{0}, runesMatch1251,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//а и н с р в л к в
|
||||
{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xE2, 0},
|
||||
{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xC2, 0}}},
|
||||
{KOI8R, "KOI8R", MatchRes{0}, runesMatchKOI8,
|
||||
codePageTable{
|
||||
//о а и т с в л к м
|
||||
{0, 0},
|
||||
{0xCF, 0}, {0xC1, 0}, {0xC9, 0}, {0xD4, 0}, {0xD3, 0}, {0xD7, 0}, {0xCC, 0}, {0xCB, 0}, {0xCD, 0},
|
||||
{0xEF, 0}, {0xE1, 0}, {0xE9, 0}, {0xF4, 0}, {0xF3, 0}, {0xF7, 0}, {0xEC, 0}, {0xEB, 0}, {0xED, 0}}},
|
||||
{ISOLatinCyrillic, "ISO-8859-5", MatchRes{0}, runesMatchISO88595,
|
||||
codePageTable{
|
||||
//о а и т с в л к е
|
||||
{0, 0},
|
||||
{0xDE, 0}, {0xD0, 0}, {0xD8, 0}, {0xE2, 0}, {0xE1, 0}, {0xD2, 0}, {0xDB, 0}, {0xDA, 0}, {0xD5, 0},
|
||||
{0xBF, 0}, {0xB0, 0}, {0xB8, 0}, {0xC2, 0}, {0xC1, 0}, {0xB2, 0}, {0xBB, 0}, {0xBA, 0}, {0xB5, 0}}},
|
||||
{UTF16LE, "UTF16LE", MatchRes{0}, runesMatchUTF16LE,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0x3E04, 0}, {0x3504, 0}, {0x1004, 0}, {0x3804, 0}, {0x3D04, 0}, {0x4204, 0}, {0x4104, 0}, {0x4004, 0}, {0x3204, 0},
|
||||
{0x1E04, 0}, {0x1504, 0}, {0x3004, 0}, {0x1804, 0}, {0x1D04, 0}, {0x2204, 0}, {0x2104, 0}, {0x2004, 0}, {0x1204, 0}}},
|
||||
{UTF16BE, "UTF16BE", MatchRes{0}, runesMatchUTF16BE,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0x043E, 0}, {0x0435, 0}, {0x0410, 0}, {0x0438, 0}, {0x043D, 0}, {0x0442, 0}, {0x0441, 0}, {0x0440, 0}, {0x0432, 0},
|
||||
{0x041E, 0}, {0x0415, 0}, {0x0430, 0}, {0x0418, 0}, {0x041D, 0}, {0x0422, 0}, {0x0421, 0}, {0x0420, 0}, {0x0412, 0}}},
|
||||
}
|
||||
*/
|
||||
|
|
6
cpd.go
6
cpd.go
|
@ -45,7 +45,11 @@ func CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error) {
|
|||
return ASCII, err
|
||||
}
|
||||
|
||||
//is buf contains the BOM of utf-8, utf-16le or utf-16be
|
||||
//возможно определение произойдёт по BOM или по валидности UTF тогда количество попаданий по кодовым страницам лучше занулить
|
||||
//на работу это повлиять не может, но если вывести статистику попаданий, то в ней останутся цифры с предыдущего определения
|
||||
CodepageDic.clearMatchCount()
|
||||
|
||||
//is buf contains the BOM of utf-8, utf-16le, utf-16be, utf-32le or utf-32be
|
||||
if idCodePage, ok := CheckBOM(buf); ok {
|
||||
return idCodePage, nil
|
||||
}
|
||||
|
|
31
cpd_test.go
31
cpd_test.go
|
@ -35,23 +35,28 @@ type tFileCodePageDetectTest struct {
|
|||
}
|
||||
|
||||
var dFileCodePageDetect = []tFileCodePageDetectTest{
|
||||
{"test_files\\utf32le-wBOM.txt", "", nil, UTF32LE}, //file contain utf32 little endian with bom
|
||||
{"test_files\\KOI8-r.txt", "", nil, KOI8R}, //file contain KOI8
|
||||
{"test_files\\IBM866.txt", "", nil, CP866}, //file contain IBM866
|
||||
{"test_files\\Win1251.txt", "", nil, Windows1251}, //file contain Windows1251
|
||||
{"test_files\\utf8-woBOM.txt", "", nil, UTF8}, //file contain utf8 with out bom rune at start
|
||||
{"test_files\\866&1251.txt", "", nil, Windows1251}, //file contain more 1251 then 866
|
||||
{"test_files\\noCodePage.txt", "", nil, UTF8}, //file contain rune only ASCII
|
||||
{"test_files\\empty_file.txt", "", nil, UTF8}, //file exist but empty, no error, return ASCII
|
||||
{"test_files\\IBM866.txt", "", nil, CP866}, //file contain IBM866
|
||||
{"test_files\\ISO8859-5.txt", "", nil, ISOLatinCyrillic}, //file contain ISO8859-5
|
||||
{"test_files\\KOI8-r.txt", "", nil, KOI8R}, //file contain KOI8
|
||||
{"test_files\\KOI8-r2.txt", "", nil, KOI8R}, //file contain KOI8
|
||||
{"test_files\\noCodePage.txt", "", nil, UTF8}, //file contain rune only ASCII
|
||||
{"test_files\\rune_encode_error.txt", "", nil, ASCII}, //file contain special rune -> encode error, but detect NO error
|
||||
{"test_files\\rune_error_1251.txt", "", nil, Windows1251}, //file contain 1251 and special rune -> encode error, but detect NO error
|
||||
{"test_files\\utf8wbom.txt", "", nil, UTF8}, //file contain utf8 with bom prefix
|
||||
{"test_files\\utf16LEwbom.txt", "", nil, UTF16LE}, //file contain utf16 little endian with BOM
|
||||
{"test_files\\utf16BEwbom.txt", "", nil, UTF16BE}, //file contain utf16 big endian with BOM
|
||||
{"test_files\\utf8.txt", "", nil, UTF8}, //file contain utf8 with out bom rune at start
|
||||
{"test_files\\utf8-wbom.txt", "", nil, UTF8}, //file contain utf8 with bom prefix
|
||||
{"test_files\\utf8-woBOM.txt", "", nil, UTF8}, //file contain utf8 with out bom rune at start
|
||||
{"test_files\\utf16be-wBOM.txt", "", nil, UTF16BE}, //file contain utf16 big endian with bom
|
||||
{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE}, //file contain utf16 big endian without bom
|
||||
{"test_files\\utf16le-wBOM.txt", "", nil, UTF16LE}, //file contain utf16 little endian with bom
|
||||
{"test_files\\utf16le-woBOM.txt", "", nil, UTF16LE}, //file contain utf16 little endian without bom
|
||||
{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE}, //file contain utf16 big endian without bom
|
||||
{"test_files\\utf32be-wBOM.txt", "", nil, UTF32BE}, //file contain utf32 big endian with bom
|
||||
{"test_files\\utf32be-woBOM.txt", "", nil, UTF32BE}, //file contain utf32 big endian without bom
|
||||
{"test_files\\utf32le-wBOM.txt", "", nil, UTF32LE}, //file contain utf32 little endian with bom
|
||||
{"test_files\\utf32le-woBOM.txt", "", nil, UTF32LE}, //file contain utf32 little endian without bom
|
||||
{"test_files\\Win1251.txt", "", nil, Windows1251}, //file contain Windows1251
|
||||
{"test_files\\Win1251Test.txt", "", nil, Windows1251}, //file contain Windows1251
|
||||
}
|
||||
|
||||
//FileCodePageDetect
|
||||
|
@ -123,12 +128,12 @@ func TestFileCodePageDetectSimple(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestFileCodePageDetectUtf8Bom(t *testing.T) {
|
||||
res, err := FileCodePageDetect("test_files\\utf8wbom.txt")
|
||||
res, err := FileCodePageDetect("test_files\\utf8-wBOM.txt")
|
||||
if err != nil {
|
||||
t.Errorf("<FileCodePageDetect()> on file 'utf8wbom.txt' err expected: nil, got: %s\n", err)
|
||||
t.Errorf("<FileCodePageDetect()> on file 'utf8-wBOM.txt' err expected: nil, got: %s\n", err)
|
||||
}
|
||||
if res != UTF8 {
|
||||
t.Errorf("<FileCodePageDetect()> on file 'utf8wbom.txt' expected: %s, got: %s\n", UTF8, res)
|
||||
t.Errorf("<FileCodePageDetect()> on file 'utf8-wBOM.txt' expected: %s, got: %s\n", UTF8, res)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
ПРИМЕР ТЕКСТА ЗАГЛАВНЫМИ KOI8-r
|
Двоичные данные
test_files/utf16BEwbom.txt
Двоичные данные
test_files/utf16BEwbom.txt
Двоичный файл не отображается.
Двоичные данные
test_files/utf16LEwbom.txt
Двоичные данные
test_files/utf16LEwbom.txt
Двоичный файл не отображается.
Двоичные данные
test_files/utf32be-woBOM.txt
Двоичные данные
test_files/utf32be-woBOM.txt
Двоичный файл не отображается.
Двоичные данные
test_files/utf32le-woBOM.txt
Двоичные данные
test_files/utf32le-woBOM.txt
Двоичный файл не отображается.
|
@ -1 +1 @@
|
|||
Русский в кодировке UTF8
|
||||
Русский в кодировке UTF8 ンラ на японском
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
code page UTF8
|
||||
Русский
|
|
@ -0,0 +1 @@
|
|||
ПРИМЕР 1251 ТОЧНО ЖУЙ ЭТИ БУЛОЧКИ
|
|
@ -0,0 +1,21 @@
|
|||
package cpd
|
||||
|
||||
//UTF-32BE
|
||||
|
||||
//первые 2 байта практически всегда меньше вторых 2 байтов
|
||||
func runesMatchUTF32BE(d []byte, tbl *codePageTable) (counts int) {
|
||||
var (
|
||||
w1 int64
|
||||
w2 int64
|
||||
)
|
||||
for i := 0; i < len(d)-4; i += 4 {
|
||||
w1 = int64(d[i]) * int64(d[i+1])
|
||||
w2 = int64(d[i+2]) * int64(d[i+3])
|
||||
if w1 > w2 {
|
||||
counts = 0
|
||||
break
|
||||
}
|
||||
counts++
|
||||
}
|
||||
return
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
package cpd
|
||||
|
||||
//UTF-32LE
|
||||
|
||||
//первые 2 байта практически всегда больше вторых 2 байтов
|
||||
//TODO лучше попробовать оценить количество 0x0 байтов по отношению к общему и если их много, то только тогла определять LE/BE
|
||||
func runesMatchUTF32LE(d []byte, tbl *codePageTable) (counts int) {
|
||||
var (
|
||||
w1 int64
|
||||
w2 int64
|
||||
)
|
||||
for i := 0; i < len(d)-4; i += 4 {
|
||||
w1 = int64(d[i]) * int64(d[i+1])
|
||||
w2 = int64(d[i+2]) * int64(d[i+3])
|
||||
if w1 < w2 {
|
||||
counts = 0 //все первые должны быть больше, иначе это не UTF-32le
|
||||
break
|
||||
}
|
||||
counts++
|
||||
}
|
||||
return
|
||||
}
|
19
utf8.go
19
utf8.go
|
@ -4,18 +4,16 @@ import "encoding/binary"
|
|||
|
||||
//unit for UTF8
|
||||
|
||||
func runesMatchUTF8(data []byte, tbl *codePageTable) (counts int) {
|
||||
n := len(data)/2 - 1
|
||||
if n <= 0 {
|
||||
func runesMatchUTF8(d []byte, tbl *codePageTable) (counts int) {
|
||||
if len(d) <= 3 {
|
||||
return
|
||||
}
|
||||
for i := 0; i < n; i += 2 {
|
||||
t := data[i : i+2]
|
||||
for i := 0; i < len(d)-3; i++ {
|
||||
t := d[i : i+2]
|
||||
d := binary.BigEndian.Uint16(t)
|
||||
j := tbl.containsRune(rune(d))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
//counts++
|
||||
}
|
||||
if isUTF8(rune(d)) {
|
||||
counts++
|
||||
|
@ -63,16 +61,19 @@ func ValidUTF8(data []byte) bool {
|
|||
zerroByteCount++
|
||||
}
|
||||
n, cp := testUTF8bitPattern(data[i])
|
||||
//n - количество байт следующих за этим которые будут использоваться для отображения данных
|
||||
//n == 0 быть не может, это получается если битовая маска 1100 0000 -> для первого байта UTF-8 это не допустимо
|
||||
if n == 0 {
|
||||
return false
|
||||
}
|
||||
i++
|
||||
var j int32 = 1
|
||||
for ; j < n; j++ {
|
||||
if (data[j] & 0xC0) != 0x80 {
|
||||
for j = 1; j < n; j++ {
|
||||
//байты с данными должны иметь маску 10xx xxxx
|
||||
if (data[i] & 0xC0) != 0x80 {
|
||||
return false
|
||||
}
|
||||
cp = (cp << 6) | int32(data[j]&0x3F)
|
||||
cp = (cp << 6) | int32(data[i]&0x3F)
|
||||
i++
|
||||
}
|
||||
|
||||
|
|
25
win1251.go
25
win1251.go
|
@ -6,18 +6,9 @@ import "unicode"
|
|||
|
||||
func runesMatch1251(data []byte, tbl *codePageTable) (counts int) {
|
||||
for i := range data {
|
||||
if i < 2 {
|
||||
if i < 1 {
|
||||
continue
|
||||
}
|
||||
//case " Us" - separator_UPPER_symbol
|
||||
if unicode.IsPunct(rune(data[i-2])) && isUpper1251(rune(data[i-1])) {
|
||||
j := tbl.containsRune(rune(data[i]))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
continue
|
||||
}
|
||||
}
|
||||
//case "ab" - counts only if symbols are arranged in pairs
|
||||
if is1251(rune(data[i-1])) {
|
||||
j := tbl.containsRune(rune(data[i]))
|
||||
|
@ -25,6 +16,20 @@ func runesMatch1251(data []byte, tbl *codePageTable) (counts int) {
|
|||
(*tbl)[j].count++
|
||||
counts++
|
||||
}
|
||||
continue
|
||||
}
|
||||
if i < 2 {
|
||||
continue
|
||||
}
|
||||
//case " Us" or ".Us" separator_UPPER_lower
|
||||
//IsPunct -
|
||||
if (unicode.IsPunct(rune(data[i-2])) || unicode.IsSpace(rune(data[i-2]))) && isUpper1251(rune(data[i-1])) {
|
||||
j := tbl.containsRune(rune(data[i]))
|
||||
if (j > 0) && (isLower1251(rune(data[i]))) {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
|
|
Загрузка…
Ссылка в новой задаче