зеркало из https://github.com/softlandia/cpd.git
v0.3.5 minor updates
This commit is contained in:
Родитель
8637e21086
Коммит
b5a7b5dbc2
|
@ -18,3 +18,13 @@ _____________________________
|
|||
* add test for UTF16LE and UTF16BE without russian
|
||||
|
||||
_____________________________
|
||||
|
||||
## ver 0.3.5 // 2020.01.27 ##
|
||||
|
||||
* minor updates
|
||||
|
||||
### todo ###
|
||||
|
||||
1 test with multithreading does __not__ pass
|
||||
|
||||
_____________________________
|
|
@ -31,8 +31,8 @@ no ID Name uint16
|
|||
|
||||
при использовании golang 1.12.6 в проект добавляется код размером ~250 kB
|
||||
|
||||
ВНИМАНИЕ!
|
||||
файлы без BOM в кодировке UTF16le и UTF16be при отсутствии русских букв опознаются неверно
|
||||
>__ВНИМАНИЕ!__
|
||||
>библиотека не поддерживает многопоточный режим
|
||||
|
||||
## зависимости ##
|
||||
|
||||
|
|
|
@ -21,15 +21,14 @@ no ID Name uint16
|
|||
9. UTF32LE: "UTF-32LE", 1019
|
||||
10. UTF32BE: "UTF-32BE", 1018
|
||||
|
||||
|
||||
## feature ##
|
||||
|
||||
encoding is determined both by the presence of the bom attribute and by heuristic
|
||||
if a file contains only Latin symbols from the first half of the code page, it is detected as UTF-8
|
||||
this is not a mistake, this is a completely correct statement
|
||||
|
||||
ATTENTION!
|
||||
files in UTF16le and UTF16be without a BOM that contain no Russian letters are not recognized correctly
|
||||
>__ATTENTION!__
|
||||
>the library does not support multithreading yet; work in progress
|
||||
|
||||
## dependencies ##
|
||||
|
||||
|
|
Двоичные данные
char_frac.xlsx
Двоичные данные
char_frac.xlsx
Двоичный файл не отображается.
101
code_pages.go
101
code_pages.go
|
@ -4,6 +4,8 @@ import (
|
|||
"bytes"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/softlandia/xlib"
|
||||
)
|
||||
|
||||
// IDCodePage - index of code page
|
||||
|
@ -34,17 +36,20 @@ func (i IDCodePage) DeleteBom(s string) (res string) {
|
|||
|
||||
// matcher - returns a MatchRes struct holding two criteria
|
||||
// this function must be implemented for each code page
|
||||
type matcher func(data []byte, tbl *codePageTable) MatchRes
|
||||
type matcher func(data []byte, tbl *cpTable) MatchRes
|
||||
|
||||
// container - returns true if byte b belongs to the code page
|
||||
type container func(b byte) bool
|
||||
|
||||
type tableElement struct {
|
||||
code rune //rune (letter) of the alphabet that interests us
|
||||
count int //the number of these runes found in the text
|
||||
}
|
||||
|
||||
// codePageTable - stores 9 letters, we will look for them in the text
|
||||
// cpTable - stores 9 letters, we will look for them in the text
|
||||
// element with index 0 for the case of non-location
|
||||
// first 9 elements lowercase, second 9 elements uppercase
|
||||
type codePageTable [19]tableElement
|
||||
type cpTable [19]tableElement
|
||||
|
||||
// MatchRes - result criteria
|
||||
// countMatch - the number of letters found in the text
|
||||
|
@ -60,12 +65,14 @@ func (m MatchRes) String() string {
|
|||
|
||||
// CodePage - realize code page
|
||||
type CodePage struct {
|
||||
id IDCodePage //id of code page
|
||||
name string //name of code page
|
||||
MatchRes //count of matching
|
||||
match matcher //method for calculating the criteria for the proximity of input data to this code page
|
||||
Boms []byte //default BOM for this codepage
|
||||
table codePageTable //table of main alphabet rune of this code page, contain [code, count]
|
||||
id IDCodePage //id of code page
|
||||
name string //name of code page
|
||||
NumByte byte //number of byte using in codepage
|
||||
MatchRes //count of matching
|
||||
match matcher //method for calculating the criteria for the proximity of input data to this code page
|
||||
contain container //method return true if this codepage contain byte
|
||||
Boms []byte //default BOM for this codepage
|
||||
table cpTable //table of main alphabet rune of this code page, contain [code, count]
|
||||
}
|
||||
|
||||
func (o CodePage) String() string {
|
||||
|
@ -84,64 +91,76 @@ func (o CodePage) MatchingRunes() string {
|
|||
return sb.String()
|
||||
}
|
||||
|
||||
// FirstAlphabetPos - return position of first alphabet
|
||||
// возвращает позицию первого алфавитного символа данной кодировки встреченную в отсортированном массиве
|
||||
func (o CodePage) FirstAlphabetPos(d []byte) int {
|
||||
d = xlib.SortBytes(d)
|
||||
for i, b := range d {
|
||||
if o.contain(b) {
|
||||
return i
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// TCodepagesDic - type to store all supported code page
|
||||
type TCodepagesDic map[IDCodePage]CodePage
|
||||
|
||||
//CodepageDic - map of all codepage
|
||||
var CodepageDic = TCodepagesDic{
|
||||
ASCII: {ASCII, "ASCII", MatchRes{0, 0}, matchASCII, []byte{},
|
||||
codePageTable{{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}},
|
||||
ASCII: {ASCII, "ASCII", 0, MatchRes{0, 0}, matchASCII, isASCII, []byte{},
|
||||
cpTable{{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}},
|
||||
|
||||
CP866: {CP866, "CP866", MatchRes{0, 0}, match866, []byte{},
|
||||
codePageTable{
|
||||
CP866: {CP866, "CP866", 1, MatchRes{0, 0}, match866, is866, []byte{},
|
||||
cpTable{
|
||||
//first element serves as sign of absence
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0xAE, 0}, {0xA5, 0}, {0xA0, 0}, {0xA8, 0}, {0xAD, 0}, {0xE2, 0}, {0xE1, 0}, {0xE0, 0}, {0xA2, 0},
|
||||
{0x8E, 0}, {0x85, 0}, {0x80, 0}, {0x88, 0}, {0x8D, 0}, {0x92, 0}, {0x91, 0}, {0x90, 0}, {0x82, 0}}},
|
||||
CP1251: {CP1251, "CP1251", MatchRes{0, 0}, match1251, []byte{},
|
||||
codePageTable{
|
||||
CP1251: {CP1251, "CP1251", 1, MatchRes{0, 0}, match1251, is1251, []byte{},
|
||||
cpTable{
|
||||
{0, 0},
|
||||
//а и н с р в л к я
|
||||
{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xFF, 0},
|
||||
{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xDF, 0}}},
|
||||
KOI8R: {KOI8R, "KOI8-R", MatchRes{0, 0}, matchKOI8, []byte{},
|
||||
codePageTable{
|
||||
KOI8R: {KOI8R, "KOI8-R", 1, MatchRes{0, 0}, matchKOI8, isKOI8, []byte{},
|
||||
cpTable{
|
||||
//о а и т с в л к м
|
||||
{0, 0},
|
||||
{0xCF, 0}, {0xC1, 0}, {0xC9, 0}, {0xD4, 0}, {0xD3, 0}, {0xD7, 0}, {0xCC, 0}, {0xCB, 0}, {0xCD, 0},
|
||||
{0xEF, 0}, {0xE1, 0}, {0xE9, 0}, {0xF4, 0}, {0xF3, 0}, {0xF7, 0}, {0xEC, 0}, {0xEB, 0}, {0xED, 0}}},
|
||||
ISOLatinCyrillic: {ISOLatinCyrillic, "ISO-8859-5", MatchRes{0, 0}, matchISO88595, []byte{},
|
||||
codePageTable{
|
||||
ISOLatinCyrillic: {ISOLatinCyrillic, "ISO-8859-5", 1, MatchRes{0, 0}, matchISO88595, isISO88595, []byte{},
|
||||
cpTable{
|
||||
//о а и т с в л к е
|
||||
{0, 0},
|
||||
{0xDE, 0}, {0xD0, 0}, {0xD8, 0}, {0xE2, 0}, {0xE1, 0}, {0xD2, 0}, {0xDB, 0}, {0xDA, 0}, {0xD5, 0},
|
||||
{0xBF, 0}, {0xB0, 0}, {0xB8, 0}, {0xC2, 0}, {0xC1, 0}, {0xB2, 0}, {0xBB, 0}, {0xBA, 0}, {0xB5, 0}}},
|
||||
UTF8: {UTF8, "UTF-8", MatchRes{0, 0}, matchUTF8, []byte{0xef, 0xbb, 0xbf},
|
||||
codePageTable{
|
||||
UTF8: {UTF8, "UTF-8", 4, MatchRes{0, 0}, matchUTF8, isASCII, []byte{0xef, 0xbb, 0xbf},
|
||||
cpTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0xD0BE, 0}, {0xD0B5, 0}, {0xD0B0, 0}, {0xD0B8, 0}, {0xD0BD, 0}, {0xD182, 0}, {0xD181, 0}, {0xD180, 0}, {0xD0B2, 0},
|
||||
{0xD09E, 0}, {0xD095, 0}, {0xD090, 0}, {0xD098, 0}, {0xD0AD, 0}, {0xD0A2, 0}, {0xD0A1, 0}, {0xD0A0, 0}, {0xD092, 0}}},
|
||||
UTF16LE: {UTF16LE, "UTF-16LE", MatchRes{0, 0}, matchUTF16le, []byte{0xff, 0xfe},
|
||||
codePageTable{
|
||||
UTF16LE: {UTF16LE, "UTF-16LE", 2, MatchRes{0, 0}, matchUTF16le, isASCII, []byte{0xff, 0xfe},
|
||||
cpTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0x3E04, 0}, {0x3504, 0}, {0x1004, 0}, {0x3804, 0}, {0x3D04, 0}, {0x4204, 0}, {0x4104, 0}, {0x4004, 0}, {0x3204, 0},
|
||||
{0x1E04, 0}, {0x1504, 0}, {0x3004, 0}, {0x1804, 0}, {0x1D04, 0}, {0x2204, 0}, {0x2104, 0}, {0x2004, 0}, {0x1204, 0}}},
|
||||
UTF16BE: {UTF16BE, "UTF-16BE", MatchRes{0, 0}, matchUTF16be, []byte{0xfe, 0xff},
|
||||
codePageTable{
|
||||
UTF16BE: {UTF16BE, "UTF-16BE", 2, MatchRes{0, 0}, matchUTF16be, isASCII, []byte{0xfe, 0xff},
|
||||
cpTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0x043E, 0}, {0x0435, 0}, {0x0410, 0}, {0x0438, 0}, {0x043D, 0}, {0x0442, 0}, {0x0441, 0}, {0x0440, 0}, {0x0432, 0},
|
||||
{0x041E, 0}, {0x0415, 0}, {0x0430, 0}, {0x0418, 0}, {0x041D, 0}, {0x0422, 0}, {0x0421, 0}, {0x0420, 0}, {0x0412, 0}}},
|
||||
UTF32BE: {UTF32BE, "UTF-32BE", MatchRes{0, 0}, matchUTF32be, []byte{0x00, 0x00, 0xfe, 0xff},
|
||||
codePageTable{
|
||||
UTF32BE: {UTF32BE, "UTF-32BE", 4, MatchRes{0, 0}, matchUTF32be, isASCII, []byte{0x00, 0x00, 0xfe, 0xff},
|
||||
cpTable{
|
||||
{0, 0},
|
||||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0},
|
||||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}}},
|
||||
UTF32LE: {UTF32LE, "UTF-32LE", MatchRes{0, 0}, matchUTF32le, []byte{0xff, 0xfe, 0x00, 0x00},
|
||||
codePageTable{
|
||||
UTF32LE: {UTF32LE, "UTF-32LE", 4, MatchRes{0, 0}, matchUTF32le, isASCII, []byte{0xff, 0xfe, 0x00, 0x00},
|
||||
cpTable{
|
||||
{0, 0},
|
||||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0},
|
||||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}}},
|
||||
|
@ -158,7 +177,6 @@ func (o TCodepagesDic) clear() {
|
|||
}
|
||||
|
||||
//Match - return the id of code page to which the data best matches
|
||||
//TODO большинству матчеров требуется более 2х символов, надо проверить на минимальную длину
|
||||
func (o TCodepagesDic) Match(data []byte) (result IDCodePage) {
|
||||
result = ASCII
|
||||
maxCount := 0
|
||||
|
@ -166,7 +184,7 @@ func (o TCodepagesDic) Match(data []byte) (result IDCodePage) {
|
|||
for id, cp := range o {
|
||||
cp.MatchRes = cp.match(data, &cp.table)
|
||||
o[id] = cp
|
||||
m = cp.countMatch + cp.countCvPairs
|
||||
m = cp.MatchRes.countMatch + cp.MatchRes.countCvPairs
|
||||
if m > maxCount {
|
||||
maxCount = m
|
||||
result = id
|
||||
|
@ -176,25 +194,10 @@ func (o TCodepagesDic) Match(data []byte) (result IDCodePage) {
|
|||
}
|
||||
|
||||
//foo function,
|
||||
func matchASCII(b []byte, tbl *codePageTable) MatchRes {
|
||||
func matchASCII(b []byte, tbl *cpTable) MatchRes {
|
||||
return MatchRes{0, 0}
|
||||
}
|
||||
|
||||
/*
|
||||
//codePageName - string of code page name runesMatchUTF32LE
|
||||
var codePageName = map[IDCodePage]string{
|
||||
ASCII: "ASCII",
|
||||
ISOLatinCyrillic: "ISO-8859-5",
|
||||
CP866: "CP866",
|
||||
CP1251: "CP1251",
|
||||
UTF8: "UTF-8",
|
||||
UTF16LE: "UTF-16LE",
|
||||
UTF16BE: "UTF-16BE",
|
||||
UTF32: "UTF-32",
|
||||
KOI8R: "KOI8-R",
|
||||
Unicode: "Unicode",
|
||||
UTF7: "UTF-7",
|
||||
UTF32LE: "UTF-32LE",
|
||||
UTF32BE: "UTF-32BE",
|
||||
// isASCII is a container predicate that accepts every byte: it ignores b
// and always returns true.
func isASCII(b byte) bool {
	const accepted = true // no byte value is ever rejected
	return accepted
}
|
||||
*/
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
package cpd
|
||||
|
||||
import "sort"
|
||||
|
||||
//codePageTable
|
||||
|
||||
// return index of rune in code page table
|
||||
// return 0 if rune not in code page table
|
||||
func (t *codePageTable) index(r rune) int {
|
||||
func (t *cpTable) index(r rune) int {
|
||||
for j, e := range *t {
|
||||
if r == e.code {
|
||||
return j
|
||||
|
@ -13,17 +15,22 @@ func (t *codePageTable) index(r rune) int {
|
|||
return 0
|
||||
}
|
||||
|
||||
func (t *codePageTable) clear() {
|
||||
func (t *cpTable) clear() {
|
||||
for i := 0; i < len(t); i++ {
|
||||
t[i].count = 0
|
||||
}
|
||||
}
|
||||
|
||||
// founded - calculates total number of matching
|
||||
func (t *codePageTable) founded() (res int) {
|
||||
func (t *cpTable) founded() (res int) {
|
||||
//0 элемент исключён, он не содержит количество найденных букв
|
||||
for i := 1; i < len(t); i++ {
|
||||
res += t[i].count
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (t *cpTable) sort() *cpTable {
|
||||
sort.Slice(&t, func(i, j int) bool { return i < j })
|
||||
return t
|
||||
}
|
35
cpd_test.go
35
cpd_test.go
|
@ -5,6 +5,7 @@ import (
|
|||
"os"
|
||||
fp "path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
@ -21,10 +22,10 @@ var dStringHasBom = []tStringHasBom{
|
|||
{ASCII, "", false},
|
||||
{CP866, "CP866", false},
|
||||
{CP1251, string([]byte{0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), false},
|
||||
{CP1251, string([]byte{0xff, 0xfe, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), false}, //contain UTF16LE bom, false because CP1251 have no bom
|
||||
{UTF8, string([]byte{0xef, 0xbb, 0xbf, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), true},
|
||||
{UTF8, string([]byte{0xef, 0xbb, 0xbe, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), false},
|
||||
{UTF8, string([]byte{0xff, 0xbb, 0xbe, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), false},
|
||||
{CP1251, string([]byte{0xff, 0xfe, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), false}, //contain UTF16LE bom, false because CP1251 have no bom
|
||||
{UTF8, string([]byte{0xef, 0xbb, 0xbf, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), true}, //UTF8 with bom
|
||||
{UTF8, string([]byte{0xef, 0xbb, 0xbe, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), false}, //UTF8 without bom
|
||||
{UTF8, string([]byte{0xff, 0xbb, 0xbe, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), false}, //UTF8 without bom
|
||||
{UTF16BE, string([]byte{0xfe, 0xff, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), true},
|
||||
{UTF16LE, string([]byte{0xff, 0xfe, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), true},
|
||||
{UTF32BE, string([]byte{0x00, 0x00, 0xfe, 0xff, 0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), true},
|
||||
|
@ -72,6 +73,7 @@ type tFileCodePageDetectTest struct {
|
|||
}
|
||||
|
||||
var dFileCodePageDetect = []tFileCodePageDetectTest{
|
||||
{fp.Join("test_files/utf16le-woBOM-only-ru.txt"), "", nil, UTF16LE}, //file contain utf16 little endian without bom
|
||||
{fp.Join("test_files/utf16le-woBOM-no-ru.txt"), "", nil, UTF16LE}, //file contain utf16 little endian without bom
|
||||
{fp.Join("test_files/utf16le-woBOM-only-latin.txt"), "", nil, UTF16LE}, //file contain utf16 little endian without bom
|
||||
{fp.Join("test_files/utf16le_las.txt"), "", nil, UTF16LE}, //file contain utf16 little endian without bom
|
||||
|
@ -100,6 +102,7 @@ var dFileCodePageDetect = []tFileCodePageDetectTest{
|
|||
{fp.Join("test_files/win1251_upper.txt"), "", nil, CP1251}, //file contain Windows1251
|
||||
{fp.Join("test_files/utf16be-woBOM-only-latin.txt"), "", nil, UTF16BE}, //file contain utf16 big endian with bom
|
||||
{fp.Join("test_files/utf16be-woBOM-no-ru.txt"), "", nil, UTF16BE}, //file contain utf16 big endian with bom
|
||||
{fp.Join("test_files/utf16be-woBOM-only-ru.txt"), "", nil, UTF16BE}, //file contain utf16 big endian with bom
|
||||
}
|
||||
|
||||
//FileCodePageDetect
|
||||
|
@ -121,6 +124,30 @@ func TestFileCodePageDetect(t *testing.T) {
|
|||
assert.NotNil(t, err, "<FileCodePageDetect> on file '' must return error, but return nil")
|
||||
}
|
||||
|
||||
func fileCodepageDetect(wg *sync.WaitGroup, cp *[]IDCodePage, fileName string) {
|
||||
defer wg.Done()
|
||||
res, _ := FileCodePageDetect(fileName)
|
||||
(*cp) = append((*cp), res)
|
||||
}
|
||||
|
||||
/*
|
||||
func TestFileCodePageDetectM(t *testing.T) {
|
||||
var (
|
||||
res IDCodePage
|
||||
cp []IDCodePage
|
||||
wg sync.WaitGroup
|
||||
)
|
||||
cp = make([]IDCodePage, 0)
|
||||
for _, d := range dFileCodePageDetect {
|
||||
wg.Add(1)
|
||||
go fileCodepageDetect(&wg, &cp, d.fn)
|
||||
}
|
||||
wg.Wait()
|
||||
for i, d := range dFileCodePageDetect {
|
||||
assert.Equal(t, cp[i], d.r, fmt.Sprintf("<FileCodePageDetect> on file '%s' expected result: %s, got: %s", d.fn, d.r, res))
|
||||
}
|
||||
}*/
|
||||
|
||||
//TestCodePageDetect - тестирование метода CodePageDetect
|
||||
// проверки на входные параметры:
|
||||
// 1. nil входящий поток явный nil, параметр останова отсутствует
|
||||
|
|
|
@ -3,7 +3,7 @@ package cpd
|
|||
//unit for ibm866
|
||||
|
||||
// for CP866 calculate only count of letter from table 'tbl'
|
||||
func match866(data []byte, tbl *codePageTable) MatchRes {
|
||||
func match866(data []byte, tbl *cpTable) MatchRes {
|
||||
for i := range data {
|
||||
j := tbl.index(rune(data[i])) //return 0 if rune data[i] not found
|
||||
(*tbl)[j].count++
|
||||
|
@ -11,7 +11,6 @@ func match866(data []byte, tbl *codePageTable) MatchRes {
|
|||
return MatchRes{tbl.founded(), 0}
|
||||
}
|
||||
|
||||
/*
|
||||
const (
|
||||
cp866StartUpperChar = 0x80
|
||||
cp866StopUpperChar = 0x9F
|
||||
|
@ -33,4 +32,3 @@ func isLower866(r byte) bool {
|
|||
func is866(r byte) bool {
|
||||
return isUpper866(r) || isLower866(r)
|
||||
}
|
||||
*/
|
||||
|
|
|
@ -2,13 +2,13 @@ package cpd
|
|||
|
||||
//unit for ISO-8859-5
|
||||
|
||||
func matchISO88595(d []byte, tbl *codePageTable) MatchRes {
|
||||
func matchISO88595(d []byte, tbl *cpTable) MatchRes {
|
||||
for i := 0; i < len(d); i++ {
|
||||
if isISO88595(rune(d[i])) {
|
||||
if isISO88595(d[i]) {
|
||||
upper := lu88595(d[i])
|
||||
j := tbl.index(rune(d[i]))
|
||||
(*tbl)[j].count++
|
||||
for i++; (i < len(d)) && isISO88595(rune(d[i])); i++ {
|
||||
for i++; (i < len(d)) && isISO88595(d[i]); i++ {
|
||||
if upper >= lu88595(d[i]) {
|
||||
j = tbl.index(rune(d[i]))
|
||||
(*tbl)[j].count++
|
||||
|
@ -27,20 +27,20 @@ const (
|
|||
)
|
||||
|
||||
func lu88595(r byte) (res int) {
|
||||
if isUpperISO88595(rune(r)) {
|
||||
if isUpperISO88595(r) {
|
||||
res = 1
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func isUpperISO88595(r rune) bool {
|
||||
func isUpperISO88595(r byte) bool {
|
||||
return (r >= cpISO88595BeginUpperChar) && (r <= cpISO88595StopUpperChar)
|
||||
}
|
||||
|
||||
func isLowerISO88595(r rune) bool {
|
||||
func isLowerISO88595(r byte) bool {
|
||||
return (r >= cpISO88595BeginLowerChar) && (r <= cpISO88595StopLowerChar)
|
||||
}
|
||||
|
||||
func isISO88595(r rune) bool {
|
||||
func isISO88595(r byte) bool {
|
||||
return isUpperISO88595(r) || isLowerISO88595(r)
|
||||
}
|
||||
|
|
4
koi8.go
4
koi8.go
|
@ -42,7 +42,7 @@ var vowelsKOI8 = [256]byte{
|
|||
/* F */ 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
|
||||
}
|
||||
|
||||
func matchKOI8(d []byte, tbl *codePageTable) MatchRes {
|
||||
func matchKOI8(d []byte, tbl *cpTable) MatchRes {
|
||||
return MatchRes{matchRuneKOI8(d, tbl), cvPairsKOI8(d)}
|
||||
}
|
||||
|
||||
|
@ -58,7 +58,7 @@ func cvPairsKOI8(d []byte) (cvPairsCount int) {
|
|||
return cvPairsCount
|
||||
}
|
||||
|
||||
func matchRuneKOI8(d []byte, tbl *codePageTable) int {
|
||||
func matchRuneKOI8(d []byte, tbl *cpTable) int {
|
||||
for i := 0; i < len(d); i++ {
|
||||
if isKOI8(d[i]) {
|
||||
upper := luKOI8(d[i])
|
||||
|
|
|
@ -1,16 +1,52 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/softlandia/cpd"
|
||||
)
|
||||
|
||||
func main() {
|
||||
t, _ := cpd.FileCodePageDetect(os.Args[1])
|
||||
fmt.Printf("cpd.FileCodePageDetect():\t%s\n", t)
|
||||
for id, cp := range cpd.CodepageDic {
|
||||
fmt.Printf("%s\tmatches:%s\t%s\n", id, cp.MatchRes, cp.MatchingRunes())
|
||||
var fl []string
|
||||
FindFilesExt(&fl, ".\\", os.Args[1])
|
||||
for _, fn := range fl {
|
||||
t, _ := cpd.FileCodePageDetect(fn)
|
||||
fmt.Printf("file: \t`%s`\t`%s`\n", fn, t)
|
||||
}
|
||||
}
|
||||
|
||||
// FindFilesExt walks the directory tree rooted at path and appends to
// *fileList the path of every non-directory entry whose extension equals
// fileNameExt, compared case-insensitively via strings.ToUpper.
//
// path        - root of the tree to walk, e.g. "c:\tmp"
// fileNameExt - extension including the leading dot, e.g. ".log"
//
// It returns the number of files appended by this call and the error
// returned by filepath.Walk. A nil fileList yields (0, error). An error met
// while walking is logged and stops the walk.
//
// sample: n, err := FindFilesExt(&fl, "c:\\tmp", ".log")
func FindFilesExt(fileList *[]string, path, fileNameExt string) (int, error) {
	if fileList == nil {
		return 0, errors.New("first parameter 'fileList' is nil")
	}

	wantExt := strings.ToUpper(fileNameExt)
	found := 0 // number of files appended during this call

	visit := func(p string, fi os.FileInfo, walkErr error) error {
		switch {
		case walkErr != nil:
			log.Printf("prevent panic by handling failure accessing a path %q: %v\n", p, walkErr)
			return walkErr
		case fi.IsDir():
			// directories are never collected
			return nil
		case strings.ToUpper(filepath.Ext(p)) == wantExt:
			found++
			*fileList = append(*fileList, p)
		}
		return nil
	}

	// The walk must finish before found is read, so keep it a separate statement.
	err := filepath.Walk(path, visit)
	return found, err
}
|
||||
|
|
Двоичные данные
sample/sample.exe
Двоичные данные
sample/sample.exe
Двоичный файл не отображается.
|
@ -0,0 +1,15 @@
|
|||
ÚÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄ¿
|
||||
³ <20>ਬ¥à ¢ à ¬ª¥ ³
|
||||
³ ³
|
||||
³ ÉÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍ» ³
|
||||
³ º º ³
|
||||
³ º º ³
|
||||
³ º º ³
|
||||
³ º º ³
|
||||
³ º º ³
|
||||
³ º º ³
|
||||
³ ÈÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍÍͼ ³
|
||||
³ ³
|
||||
³ ³
|
||||
ÀÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÄÙ
|
||||
|
|
@ -8,7 +8,7 @@ import (
|
|||
|
||||
//unit for UTF16BE
|
||||
|
||||
func matchUTF16be(b []byte, tbl *codePageTable) MatchRes {
|
||||
func matchUTF16be(b []byte, tbl *cpTable) MatchRes {
|
||||
n := len(b)/2 - 1
|
||||
if n <= 0 {
|
||||
return MatchRes{0, 0}
|
||||
|
@ -34,7 +34,7 @@ func matchUTF16beZerro(b []byte) int {
|
|||
|
||||
// matchUTF16beRu - вычисляет критерий по количеству русских букв
|
||||
// tbl *codePageTable - передаётся не для нахождения кодировки, а для заполнения встречаемости популярных русских букв
|
||||
func matchUTF16beRu(data []byte, tbl *codePageTable) int {
|
||||
func matchUTF16beRu(data []byte, tbl *cpTable) int {
|
||||
matches := 0
|
||||
n := len(data)/2 - 1
|
||||
if n <= 0 {
|
||||
|
|
|
@ -13,11 +13,15 @@ import (
|
|||
//проверка на BOM уже выполнена, в принимаемом массиве не BOM символов
|
||||
|
||||
// matchUTF16le - функция вычисляет общий критерий для кодировки UTF16LE
|
||||
func matchUTF16le(b []byte, tbl *codePageTable) MatchRes {
|
||||
func matchUTF16le(b []byte, tbl *cpTable) MatchRes {
|
||||
n := len(b)/2 - 1
|
||||
if n <= 0 {
|
||||
return MatchRes{0, 0}
|
||||
}
|
||||
//два критерия используется
|
||||
//первый количество найденных русских букв
|
||||
//второй количество найденных 0x00
|
||||
//решающим является максимальный
|
||||
return MatchRes{xlib.Max(matchUTF16leRu(b, tbl), matchUTF16leZerro(b)), 0}
|
||||
}
|
||||
|
||||
|
@ -35,7 +39,7 @@ func matchUTF16leZerro(b []byte) int {
|
|||
|
||||
// matchUTF16leRu - вычисляет критерий по количеству русских букв
|
||||
// tbl *codePageTable - передаётся не для нахождения кодировки, а для заполнения встречаемости популярных русских букв
|
||||
func matchUTF16leRu(b []byte, tbl *codePageTable) int {
|
||||
func matchUTF16leRu(b []byte, tbl *cpTable) int {
|
||||
matches := 0
|
||||
count04 := 0
|
||||
n := len(b)/2 - 1
|
||||
|
|
|
@ -3,7 +3,7 @@ package cpd
|
|||
//UTF-32BE
|
||||
|
||||
//первые 2 байта практически всегда меньше вторых 2 байтов
|
||||
func matchUTF32be(d []byte, tbl *codePageTable) MatchRes {
|
||||
func matchUTF32be(d []byte, tbl *cpTable) MatchRes {
|
||||
zerroCounts := 0
|
||||
for i := 0; i < len(d)-4; i += 4 {
|
||||
if (int(d[i]) + int(d[i+1])) == 0 {
|
||||
|
|
|
@ -4,7 +4,7 @@ package cpd
|
|||
|
||||
//вторые 2 байта практически всегда 0
|
||||
//используемый признак не сработает если больше половины текста будет набрано символами с 4 значащими байтами, не представляю, что это за текст...
|
||||
func matchUTF32le(d []byte, tbl *codePageTable) MatchRes {
|
||||
func matchUTF32le(d []byte, tbl *cpTable) MatchRes {
|
||||
zerroCounts := 0
|
||||
for i := 0; i < len(d)-4; i += 4 {
|
||||
if (int(d[i+2]) + int(d[i+3])) == 0 {
|
||||
|
|
2
utf8.go
2
utf8.go
|
@ -4,7 +4,7 @@ import "encoding/binary"
|
|||
|
||||
//unit for UTF8
|
||||
|
||||
func matchUTF8(d []byte, tbl *codePageTable) MatchRes {
|
||||
func matchUTF8(d []byte, tbl *cpTable) MatchRes {
|
||||
matches := 0
|
||||
if len(d) <= 3 {
|
||||
return MatchRes{matches, 0}
|
||||
|
|
40
win1251.go
40
win1251.go
|
@ -56,7 +56,7 @@ func cvPairs1251(d []byte) (cvPairsCount int) {
|
|||
}
|
||||
|
||||
// matchRunes1251 - counts the number of characters that are the most popular letters of the Russian alphabet
|
||||
func matchRune1251(d []byte, tbl *codePageTable) int {
|
||||
func matchRune1251(d []byte, tbl *cpTable) int {
|
||||
for i := 0; i < len(d); i++ {
|
||||
if is1251(d[i]) {
|
||||
upper := lu1251(d[i])
|
||||
|
@ -73,7 +73,7 @@ func matchRune1251(d []byte, tbl *codePageTable) int {
|
|||
return tbl.founded()
|
||||
}
|
||||
|
||||
func match1251(d []byte, tbl *codePageTable) MatchRes {
|
||||
func match1251(d []byte, tbl *cpTable) MatchRes {
|
||||
return MatchRes{matchRune1251(d, tbl), cvPairs1251(d)}
|
||||
}
|
||||
|
||||
|
@ -103,3 +103,39 @@ func isLower1251(r byte) bool {
|
|||
func is1251(r byte) bool {
|
||||
return isUpper1251(r) || isLower1251(r)
|
||||
}
|
||||
|
||||
/*var = [66]byte{
|
||||
0xE0, 0xC0, // 'а'
|
||||
0xE1, 0xC1, // 'б'
|
||||
0xE2, 0xC2, // 'в'
|
||||
0xE3, 0xC3, // 'г'
|
||||
0xE4, 0xC4, // 'д'
|
||||
0xE5, 0xC5, // 'е'
|
||||
0xB8, 0xA8, // 'ё'
|
||||
0xE6, 0xC6, // 'ж'
|
||||
0xE7, 0xC7, // 'з'
|
||||
0xE8, 0xC8, // 'и'
|
||||
0xE9, 0xC9, // 'й'
|
||||
0xEA, 0xCA, // 'к'
|
||||
0xEB, 0xCB, // 'л'
|
||||
0xEC, 0xCC, // 'м'
|
||||
0xED, 0xCD, // 'н'
|
||||
0xEE, 0xCE, // 'о'
|
||||
0xEF, 0xCF, // 'п'
|
||||
0xF0, 0xD0, // 'р'
|
||||
0xF1, 0xD1, // 'с'
|
||||
0xF2, 0xD2, // 'т'
|
||||
0xF3, 0xD3, // 'у'
|
||||
0xF4, 0xD4, // 'ф'
|
||||
0xF5, 0xD5, // 'х'
|
||||
0xF6, 0xD6, // 'ц'
|
||||
0xF7, 0xD7, // 'ч'
|
||||
0xF8, 0xD8, // 'ш'
|
||||
0xF9, 0xD9, // 'щ'
|
||||
0xFA, 0xDA, // 'ъ'
|
||||
0xFB, 0xDB, // 'ы'
|
||||
0xFC, 0xDC, // 'ь'
|
||||
0xFD, 0xDD, // 'э'
|
||||
0xFE, 0xDE, // 'ю'
|
||||
0xFF, 0xDF, // 'я'
|
||||
}*/
|
||||
|
|
Загрузка…
Ссылка в новой задаче