зеркало из https://github.com/softlandia/cpd.git
v0.4.0 multithreading support
This commit is contained in:
Родитель
492f173dc3
Коммит
a1cf46e7d7
17
HIST.md
17
HIST.md
|
@ -9,7 +9,7 @@
|
|||
|
||||
### todo ###
|
||||
|
||||
1 UTF16LE & UTF16BE not recognized correctly if file no contains russian characters
|
||||
- UTF16LE & UTF16BE are not recognized correctly if the file contains no Russian characters
|
||||
_____________________________
|
||||
|
||||
## ver 0.3.4 // 2020.01.17 ##
|
||||
|
@ -25,6 +25,19 @@ _____________________________
|
|||
|
||||
### todo ###
|
||||
|
||||
1 test with multithreading __not__ pass,
|
||||
- tests with multithreading do __not__ pass
|
||||
|
||||
_____________________________
|
||||
|
||||
## ver 0.4.0 // 2020.01.29 ##
|
||||
|
||||
* multithreading support updates
|
||||
* add multithreading tests
|
||||
* rename exported functions
|
||||
* hide global var CodepageDic from export, rename to codepageDic
|
||||
|
||||
### todo ###
|
||||
|
||||
- a UTF32 string without BOM and without Russian characters is detected as UTF16
|
||||
|
||||
_____________________________
|
||||
|
|
|
@ -27,16 +27,17 @@ no ID Name uint16
|
|||
|
||||
определение делается как по наличию признака BOM в начале файла так и по эвристическому алгоритму
|
||||
если данные содержат только латинские символы (первая половина ASCII таблицы) будет определена кодировка UTF-8
|
||||
это не является ошибкой, поскольку такой файл или данные действительно можно корректно интерпретировать как UTF-8
|
||||
это не является ошибкой, поскольку такой файл или данные действительно можно корректно интерпретировать как UTF-8
|
||||
возможно некорректное определение файлов в кодировке UTF-32, не содержащих русских символов
|
||||
|
||||
>__ВНИМАНИЕ!__
|
||||
>библиотека не поддерживает многопоточный режим
|
||||
>библиотека __поддерживает__ многопоточный режим
|
||||
|
||||
## зависимости ##
|
||||
|
||||
>"golang.org/x/text/encoding/charmap"
|
||||
>"golang.org/x/text/transform"
|
||||
>"github.com/softlandia/xlib"
|
||||
|
||||
|
||||
## типы ##
|
||||
|
||||
|
|
|
@ -25,16 +25,16 @@ no ID Name uint16
|
|||
|
||||
encoding is determined both by the presence of a BOM and by a heuristic algorithm
|
||||
if a file contains only Latin symbols from the first half of the code page, it is detected as UTF-8
|
||||
this is not a mistake, this is a completely correct statement
|
||||
this is not a mistake, this is a completely correct statement
|
||||
detection of UTF32 files without Russian characters may be incorrect
|
||||
|
||||
>__ATTENTION!__
|
||||
>library not support multithreading, I work...
|
||||
>library __supports__ multithreading
|
||||
|
||||
## dependencies ##
|
||||
|
||||
>"golang.org/x/text/encoding/charmap"
|
||||
>"golang.org/x/text/transform"
|
||||
>"github.com/softlandia/xlib"
|
||||
|
||||
## types ##
|
||||
|
||||
|
|
136
code_pages.go
136
code_pages.go
|
@ -4,8 +4,6 @@ import (
|
|||
"bytes"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/softlandia/xlib"
|
||||
)
|
||||
|
||||
// IDCodePage - index of code page
|
||||
|
@ -14,22 +12,22 @@ type IDCodePage uint16
|
|||
|
||||
func (i IDCodePage) String() string {
|
||||
//return codePageName[i]
|
||||
return CodepageDic[i].name
|
||||
return codepageDic[i].name
|
||||
}
|
||||
|
||||
//StringHasBom - return true if input string has BOM prefix
|
||||
func (i IDCodePage) StringHasBom(s string) bool {
|
||||
if len(CodepageDic[i].Boms) == 0 {
|
||||
if len(codepageDic[i].Boms) == 0 {
|
||||
return false
|
||||
}
|
||||
return bytes.HasPrefix([]byte(s), CodepageDic[i].Boms)
|
||||
return bytes.HasPrefix([]byte(s), codepageDic[i].Boms)
|
||||
}
|
||||
|
||||
//DeleteBom - return string without prefix bom bytes
|
||||
func (i IDCodePage) DeleteBom(s string) (res string) {
|
||||
res = s
|
||||
if i.StringHasBom(s) {
|
||||
res = res[len(CodepageDic[i].Boms):]
|
||||
res = res[len(codepageDic[i].Boms):]
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
@ -94,7 +92,7 @@ func (o CodePage) MatchingRunes() string {
|
|||
// FirstAlphabetPos - return position of first alphabet
|
||||
// возвращает позицию первого алфавитного символа данной кодировки встреченную в отсортированном массиве
|
||||
func (o CodePage) FirstAlphabetPos(d []byte) int {
|
||||
d = xlib.SortBytes(d)
|
||||
d = sortBytes(d)
|
||||
for i, b := range d {
|
||||
if o.contain(b) {
|
||||
return i
|
||||
|
@ -106,8 +104,99 @@ func (o CodePage) FirstAlphabetPos(d []byte) int {
|
|||
// TCodepagesDic - type to store all supported code page
|
||||
type TCodepagesDic map[IDCodePage]CodePage
|
||||
|
||||
//CodepageDic - map of all codepage
|
||||
var CodepageDic = TCodepagesDic{
|
||||
// NewCodepageDic - create and return a new, independent map of all supported code pages
|
||||
func NewCodepageDic() TCodepagesDic {
|
||||
return TCodepagesDic{
|
||||
ASCII: {ASCII, "ASCII", 0, MatchRes{0, 0}, matchASCII, isASCII, []byte{},
|
||||
cpTable{{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}},
|
||||
|
||||
CP866: {CP866, "CP866", 1, MatchRes{0, 0}, match866, is866, []byte{},
|
||||
cpTable{
|
||||
//first element serves as sign of absence
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0xAE, 0}, {0xA5, 0}, {0xA0, 0}, {0xA8, 0}, {0xAD, 0}, {0xE2, 0}, {0xE1, 0}, {0xE0, 0}, {0xA2, 0},
|
||||
{0x8E, 0}, {0x85, 0}, {0x80, 0}, {0x88, 0}, {0x8D, 0}, {0x92, 0}, {0x91, 0}, {0x90, 0}, {0x82, 0}}},
|
||||
CP1251: {CP1251, "CP1251", 1, MatchRes{0, 0}, match1251, is1251, []byte{},
|
||||
cpTable{
|
||||
{0, 0},
|
||||
//а и н с р в л к я
|
||||
{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xFF, 0},
|
||||
{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xDF, 0}}},
|
||||
KOI8R: {KOI8R, "KOI8-R", 1, MatchRes{0, 0}, matchKOI8, isKOI8, []byte{},
|
||||
cpTable{
|
||||
//о а и т с в л к м
|
||||
{0, 0},
|
||||
{0xCF, 0}, {0xC1, 0}, {0xC9, 0}, {0xD4, 0}, {0xD3, 0}, {0xD7, 0}, {0xCC, 0}, {0xCB, 0}, {0xCD, 0},
|
||||
{0xEF, 0}, {0xE1, 0}, {0xE9, 0}, {0xF4, 0}, {0xF3, 0}, {0xF7, 0}, {0xEC, 0}, {0xEB, 0}, {0xED, 0}}},
|
||||
ISOLatinCyrillic: {ISOLatinCyrillic, "ISO-8859-5", 1, MatchRes{0, 0}, matchISO88595, isISO88595, []byte{},
|
||||
cpTable{
|
||||
//о а и т с в л к е
|
||||
{0, 0},
|
||||
{0xDE, 0}, {0xD0, 0}, {0xD8, 0}, {0xE2, 0}, {0xE1, 0}, {0xD2, 0}, {0xDB, 0}, {0xDA, 0}, {0xD5, 0},
|
||||
{0xBF, 0}, {0xB0, 0}, {0xB8, 0}, {0xC2, 0}, {0xC1, 0}, {0xB2, 0}, {0xBB, 0}, {0xBA, 0}, {0xB5, 0}}},
|
||||
UTF8: {UTF8, "UTF-8", 4, MatchRes{0, 0}, matchUTF8, isASCII, []byte{0xef, 0xbb, 0xbf},
|
||||
cpTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0xD0BE, 0}, {0xD0B5, 0}, {0xD0B0, 0}, {0xD0B8, 0}, {0xD0BD, 0}, {0xD182, 0}, {0xD181, 0}, {0xD180, 0}, {0xD0B2, 0},
|
||||
{0xD09E, 0}, {0xD095, 0}, {0xD090, 0}, {0xD098, 0}, {0xD0AD, 0}, {0xD0A2, 0}, {0xD0A1, 0}, {0xD0A0, 0}, {0xD092, 0}}},
|
||||
UTF16LE: {UTF16LE, "UTF-16LE", 2, MatchRes{0, 0}, matchUTF16le, isASCII, []byte{0xff, 0xfe},
|
||||
cpTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0x3E04, 0}, {0x3504, 0}, {0x1004, 0}, {0x3804, 0}, {0x3D04, 0}, {0x4204, 0}, {0x4104, 0}, {0x4004, 0}, {0x3204, 0},
|
||||
{0x1E04, 0}, {0x1504, 0}, {0x3004, 0}, {0x1804, 0}, {0x1D04, 0}, {0x2204, 0}, {0x2104, 0}, {0x2004, 0}, {0x1204, 0}}},
|
||||
UTF16BE: {UTF16BE, "UTF-16BE", 2, MatchRes{0, 0}, matchUTF16be, isASCII, []byte{0xfe, 0xff},
|
||||
cpTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0x043E, 0}, {0x0435, 0}, {0x0410, 0}, {0x0438, 0}, {0x043D, 0}, {0x0442, 0}, {0x0441, 0}, {0x0440, 0}, {0x0432, 0},
|
||||
{0x041E, 0}, {0x0415, 0}, {0x0430, 0}, {0x0418, 0}, {0x041D, 0}, {0x0422, 0}, {0x0421, 0}, {0x0420, 0}, {0x0412, 0}}},
|
||||
UTF32BE: {UTF32BE, "UTF-32BE", 4, MatchRes{0, 0}, matchUTF32be, isASCII, []byte{0x00, 0x00, 0xfe, 0xff},
|
||||
cpTable{
|
||||
{0, 0},
|
||||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0},
|
||||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}}},
|
||||
UTF32LE: {UTF32LE, "UTF-32LE", 4, MatchRes{0, 0}, matchUTF32le, isASCII, []byte{0xff, 0xfe, 0x00, 0x00},
|
||||
cpTable{
|
||||
{0, 0},
|
||||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0},
|
||||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}}},
|
||||
}
|
||||
}
|
||||
|
||||
//before detecting a code page, all counts need to be cleared
|
||||
//this is not required for a correct run; it is needed only to get correct statistics
|
||||
func (o TCodepagesDic) clear() {
|
||||
for id, cp := range o {
|
||||
cp.MatchRes = MatchRes{0, 0}
|
||||
cp.table.clear()
|
||||
o[id] = cp
|
||||
}
|
||||
}
|
||||
|
||||
// Match - return the id of code page to which the data best matches
|
||||
// call function match of each codepage
|
||||
func (o TCodepagesDic) Match(data []byte) (result IDCodePage) {
|
||||
result = ASCII
|
||||
maxCount := 0
|
||||
m := 0
|
||||
for id, cp := range o {
|
||||
cp.MatchRes = cp.match(data, &cp.table)
|
||||
o[id] = cp
|
||||
m = cp.MatchRes.countMatch + cp.MatchRes.countCvPairs
|
||||
if m > maxCount {
|
||||
maxCount = m
|
||||
result = id
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// codepageDic - global map of all code pages
|
||||
// used by the support functions
|
||||
var codepageDic = TCodepagesDic{
|
||||
ASCII: {ASCII, "ASCII", 0, MatchRes{0, 0}, matchASCII, isASCII, []byte{},
|
||||
cpTable{{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}},
|
||||
|
||||
|
@ -166,34 +255,7 @@ var CodepageDic = TCodepagesDic{
|
|||
{0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}, {0x0, 0}}},
|
||||
}
|
||||
|
||||
//befor detecting of code page need clear all counts
|
||||
//this not for correct run, this need only if we want get correct statistic
|
||||
func (o TCodepagesDic) clear() {
|
||||
for id, cp := range o {
|
||||
cp.MatchRes = MatchRes{0, 0}
|
||||
cp.table.clear()
|
||||
o[id] = cp
|
||||
}
|
||||
}
|
||||
|
||||
//Match - return the id of code page to which the data best matches
|
||||
func (o TCodepagesDic) Match(data []byte) (result IDCodePage) {
|
||||
result = ASCII
|
||||
maxCount := 0
|
||||
m := 0
|
||||
for id, cp := range o {
|
||||
cp.MatchRes = cp.match(data, &cp.table)
|
||||
o[id] = cp
|
||||
m = cp.MatchRes.countMatch + cp.MatchRes.countCvPairs
|
||||
if m > maxCount {
|
||||
maxCount = m
|
||||
result = id
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
//foo function,
|
||||
//foo function for default codepage ASCII
|
||||
func matchASCII(b []byte, tbl *cpTable) MatchRes {
|
||||
return MatchRes{0, 0}
|
||||
}
|
||||
|
|
12
cpTable.go
12
cpTable.go
|
@ -15,12 +15,6 @@ func (t *cpTable) index(r rune) int {
|
|||
return 0
|
||||
}
|
||||
|
||||
func (t *cpTable) clear() {
|
||||
for i := 0; i < len(t); i++ {
|
||||
t[i].count = 0
|
||||
}
|
||||
}
|
||||
|
||||
// founded - calculates total number of matching
|
||||
func (t *cpTable) founded() (res int) {
|
||||
//0 элемент исключён, он не содержит количество найденных букв
|
||||
|
@ -30,6 +24,12 @@ func (t *cpTable) founded() (res int) {
|
|||
return
|
||||
}
|
||||
|
||||
func (t *cpTable) clear() {
|
||||
for i := 0; i < len(t); i++ {
|
||||
t[i].count = 0
|
||||
}
|
||||
}
|
||||
|
||||
func (t *cpTable) sort() *cpTable {
|
||||
sort.Slice(&t, func(i, j int) bool { return i < j })
|
||||
return t
|
||||
|
|
95
cpd.go
95
cpd.go
|
@ -16,60 +16,47 @@ import (
|
|||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
//ReadBufSize - byte count for reading from file, func FileCodePageDetect()
|
||||
// ReadBufSize - byte count for reading from file, func FileCodePageDetect()
|
||||
var ReadBufSize int = 1024
|
||||
|
||||
//FileCodePageDetect - detect code page of text file
|
||||
func FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error) {
|
||||
// FileCodepageDetect - detect code page of text file
|
||||
func FileCodepageDetect(fn string, stopStr ...string) (IDCodePage, error) {
|
||||
|
||||
iFile, err := os.Open(fn)
|
||||
if err != nil {
|
||||
return ASCII, err
|
||||
}
|
||||
defer iFile.Close()
|
||||
|
||||
if len(stopStr) > 0 {
|
||||
return CodePageDetect(iFile, stopStr[0])
|
||||
}
|
||||
return CodePageDetect(iFile)
|
||||
return CodepageDetect(iFile)
|
||||
}
|
||||
|
||||
//CodePageDetect - detect code page of ascii data from reader 'r'
|
||||
func CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error) {
|
||||
//test input interfase
|
||||
// CodepageDetect - detect code page of ascii data from reader 'r'
|
||||
func CodepageDetect(r io.Reader) (IDCodePage, error) {
|
||||
if r == nil {
|
||||
return ASCII, nil
|
||||
}
|
||||
//make slice of byte from input reader
|
||||
buf, err := bufio.NewReader(r).Peek(ReadBufSize)
|
||||
if (err != nil) && (err != io.EOF) {
|
||||
return ASCII, err
|
||||
}
|
||||
|
||||
//clear all counts and matching result
|
||||
//CodepageDic - global var and need cleaning befor reuse
|
||||
CodepageDic.clear()
|
||||
|
||||
//match code page from BOM, support: utf-8, utf-16le, utf-16be, utf-32le or utf-32be
|
||||
if idCodePage, ok := CheckBOM(buf); ok {
|
||||
return idCodePage, nil
|
||||
}
|
||||
|
||||
if ValidUTF8(buf) {
|
||||
return UTF8, nil
|
||||
}
|
||||
|
||||
return CodePageAutoDetect(buf), nil
|
||||
return CodepageAutoDetect(buf), nil
|
||||
}
|
||||
|
||||
//CodePageAutoDetect - auto detect code page of input content
|
||||
func CodePageAutoDetect(content []byte) (result IDCodePage) {
|
||||
return CodepageDic.Match(content)
|
||||
// CodepageAutoDetect - auto detect code page of input content
|
||||
func CodepageAutoDetect(b []byte) IDCodePage {
|
||||
return NewCodepageDic().Match(b)
|
||||
}
|
||||
|
||||
//FileConvertCodePage - replace code page text file from one to another
|
||||
// FileConvertCodepage - replace code page text file from one to another
|
||||
// support convert only from/to Windows1251/IBM866
|
||||
func FileConvertCodePage(fileName string, fromCP, toCP IDCodePage) error {
|
||||
func FileConvertCodepage(fileName string, fromCP, toCP IDCodePage) error {
|
||||
if fromCP == toCP {
|
||||
return nil
|
||||
}
|
||||
|
@ -100,7 +87,7 @@ func FileConvertCodePage(fileName string, fromCP, toCP IDCodePage) error {
|
|||
iScanner := bufio.NewScanner(iFile)
|
||||
for i := 0; iScanner.Scan(); i++ {
|
||||
s = iScanner.Text()
|
||||
s, err = StrConvertCodePage(s, fromCP, toCP)
|
||||
s, err = StrConvertCodepage(s, fromCP, toCP)
|
||||
if err != nil {
|
||||
oFile.Close()
|
||||
os.Remove(tmpFileName)
|
||||
|
@ -113,20 +100,37 @@ func FileConvertCodePage(fileName string, fromCP, toCP IDCodePage) error {
|
|||
return os.Rename(tmpFileName, fileName)
|
||||
}
|
||||
|
||||
//ToUTF8 -
|
||||
//TODO need realization
|
||||
func ToUTF8(s string) string {
|
||||
return s
|
||||
}
|
||||
|
||||
//IsSeparator - return true if input rune is SPACE or PUNCT
|
||||
func IsSeparator(r rune) bool {
|
||||
return unicode.IsPunct(r) || unicode.IsSpace(r)
|
||||
}
|
||||
|
||||
//StrConvertCodePage - convert string from one code page to another
|
||||
// CodepageAsString - return name of char set with id codepage
|
||||
// if codepage not exist - return ""
|
||||
func CodepageAsString(codepage IDCodePage) string {
|
||||
return codepageDic[codepage].name
|
||||
}
|
||||
|
||||
//DecodeUTF16 - decode slice of bytes from UTF-16 (low byte first) to a UTF-8 string
|
||||
func DecodeUTF16(b []byte) string {
|
||||
if len(b)%2 != 0 {
|
||||
return string(b)
|
||||
}
|
||||
u16s := make([]uint16, 1)
|
||||
ret := &bytes.Buffer{}
|
||||
b8buf := make([]byte, 4)
|
||||
for i := 0; i < len(b); i += 2 {
|
||||
u16s[0] = uint16(b[i]) + (uint16(b[i+1]) << 8)
|
||||
r := utf16.Decode(u16s)
|
||||
n := utf8.EncodeRune(b8buf, r[0])
|
||||
ret.Write(b8buf[:n])
|
||||
}
|
||||
return ret.String()
|
||||
}
|
||||
|
||||
// StrConvertCodepage - convert string from one code page to another
|
||||
// function for future, at now support convert only from/to Windows1251/IBM866
|
||||
func StrConvertCodePage(s string, fromCP, toCP IDCodePage) (string, error) {
|
||||
func StrConvertCodepage(s string, fromCP, toCP IDCodePage) (string, error) {
|
||||
if len(s) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
|
@ -150,26 +154,3 @@ func StrConvertCodePage(s string, fromCP, toCP IDCodePage) (string, error) {
|
|||
}
|
||||
return s, err
|
||||
}
|
||||
|
||||
// CodePageAsString - return name of char set with id codepage
|
||||
// if codepage not exist - return ""
|
||||
func CodePageAsString(codepage IDCodePage) string {
|
||||
return CodepageDic[codepage].name
|
||||
}
|
||||
|
||||
//DecodeUTF16 - decode slice of byte from UTF16 to UTF8
|
||||
func DecodeUTF16(b []byte) string {
|
||||
if len(b)%2 != 0 {
|
||||
return string(b)
|
||||
}
|
||||
u16s := make([]uint16, 1)
|
||||
ret := &bytes.Buffer{}
|
||||
b8buf := make([]byte, 4)
|
||||
for i := 0; i < len(b); i += 2 {
|
||||
u16s[0] = uint16(b[i]) + (uint16(b[i+1]) << 8)
|
||||
r := utf16.Decode(u16s)
|
||||
n := utf8.EncodeRune(b8buf, r[0])
|
||||
ret.Write(b8buf[:n])
|
||||
}
|
||||
return ret.String()
|
||||
}
|
||||
|
|
74
cpd_test.go
74
cpd_test.go
|
@ -53,7 +53,7 @@ var dCodePageAsString = []tCodePageAsString{
|
|||
|
||||
func TestCodePageAsString(t *testing.T) {
|
||||
for i, v := range dCodePageAsString {
|
||||
s := CodePageAsString(v.id)
|
||||
s := CodepageAsString(v.id)
|
||||
assert.Equal(t, v.s, s, fmt.Sprintf("<CodePageAsString> on test: %d return: %s, expected: %s", i, s, v.s))
|
||||
}
|
||||
}
|
||||
|
@ -67,7 +67,7 @@ func TestCodepageString(t *testing.T) {
|
|||
|
||||
type tFileCodePageDetectTest struct {
|
||||
fn string //filename
|
||||
st string //stop string
|
||||
st string //stop string, not using now
|
||||
e error //
|
||||
r IDCodePage //expected result
|
||||
}
|
||||
|
@ -100,9 +100,11 @@ var dFileCodePageDetect = []tFileCodePageDetectTest{
|
|||
{fp.Join("test_files/utf32le-woBOM.txt"), "", nil, UTF32LE}, //file contain utf32 little endian without bom
|
||||
{fp.Join("test_files/Win1251.txt"), "", nil, CP1251}, //file contain Windows1251
|
||||
{fp.Join("test_files/win1251_upper.txt"), "", nil, CP1251}, //file contain Windows1251
|
||||
{fp.Join("test_files/utf16be-woBOM-only-latin.txt"), "", nil, UTF16BE}, //file contain utf16 big endian with bom
|
||||
{fp.Join("test_files/utf16be-woBOM-no-ru.txt"), "", nil, UTF16BE}, //file contain utf16 big endian with bom
|
||||
{fp.Join("test_files/utf16be-woBOM-only-ru.txt"), "", nil, UTF16BE}, //file contain utf16 big endian with bom
|
||||
{fp.Join("test_files/utf16be-woBOM-only-latin.txt"), "", nil, UTF16BE}, //file contains utf16be without bom
|
||||
{fp.Join("test_files/utf16be-woBOM-no-ru.txt"), "", nil, UTF16BE}, //file contains utf16be without bom
|
||||
{fp.Join("test_files/utf16be-woBOM-only-ru.txt"), "", nil, UTF16BE}, //file contains utf16be without bom
|
||||
{fp.Join("test_files/utf32be-ascii-no-ru.txt"), "", nil, UTF16BE}, //file contain utf32be w/o bom w/o ru, detected as UTF16BE
|
||||
{fp.Join("test_files/utf32le-ascii-no-ru.txt"), "", nil, UTF16LE}, //file contain utf32le w/o bom w/o ru, detected as UTF16LE
|
||||
}
|
||||
|
||||
//FileCodePageDetect
|
||||
|
@ -112,41 +114,33 @@ func TestFileCodePageDetect(t *testing.T) {
|
|||
res IDCodePage
|
||||
)
|
||||
for _, d := range dFileCodePageDetect {
|
||||
res, err = FileCodePageDetect(d.fn)
|
||||
res, err = FileCodepageDetect(d.fn)
|
||||
assert.Equal(t, err, d.e, fmt.Sprintf("<FileCodePageDetect> on file '%s' expected error: '%v', got: '%v', ", d.fn, d.e, err))
|
||||
assert.Equal(t, res, d.r, fmt.Sprintf("<FileCodePageDetect> on file '%s' expected result: %s, got: %s", d.fn, d.r, res))
|
||||
}
|
||||
|
||||
_, err = FileCodePageDetect("-.-") //file "-.-" not exist
|
||||
_, err = FileCodepageDetect("-.-") //file "-.-" not exist
|
||||
assert.NotNil(t, err, "<FileCodePageDetect> on file '-.-' must return error, but return nil")
|
||||
|
||||
_, err = FileCodePageDetect("") //file "" not exist
|
||||
_, err = FileCodepageDetect("") //file "" not exist
|
||||
assert.NotNil(t, err, "<FileCodePageDetect> on file '' must return error, but return nil")
|
||||
}
|
||||
|
||||
func fileCodepageDetect(wg *sync.WaitGroup, cp *[]IDCodePage, fileName string) {
|
||||
func fileCodepageDetect(wg *sync.WaitGroup, t *testing.T, trusted IDCodePage, f string) {
|
||||
defer wg.Done()
|
||||
res, _ := FileCodePageDetect(fileName)
|
||||
(*cp) = append((*cp), res)
|
||||
res, _ := FileCodepageDetect(f)
|
||||
assert.Equal(t, trusted, res, fmt.Sprintf("<FileCodePageDetect> on file '%s' expected: %s, got: %s", f, trusted, res))
|
||||
}
|
||||
|
||||
/*
|
||||
//тестирование в многопоточном режиме
|
||||
func TestFileCodePageDetectM(t *testing.T) {
|
||||
var (
|
||||
res IDCodePage
|
||||
cp []IDCodePage
|
||||
wg sync.WaitGroup
|
||||
)
|
||||
cp = make([]IDCodePage, 0)
|
||||
var wg sync.WaitGroup
|
||||
for _, d := range dFileCodePageDetect {
|
||||
wg.Add(1)
|
||||
go fileCodepageDetect(&wg, &cp, d.fn)
|
||||
go fileCodepageDetect(&wg, t, d.r, d.fn)
|
||||
}
|
||||
wg.Wait()
|
||||
for i, d := range dFileCodePageDetect {
|
||||
assert.Equal(t, cp[i], d.r, fmt.Sprintf("<FileCodePageDetect> on file '%s' expected result: %s, got: %s", d.fn, d.r, res))
|
||||
}
|
||||
}*/
|
||||
}
|
||||
|
||||
//TestCodePageDetect - тестирование метода CodePageDetect
|
||||
// проверки на входные параметры:
|
||||
|
@ -155,70 +149,66 @@ func TestFileCodePageDetectM(t *testing.T) {
|
|||
// 3. входящий поток не инициализированный объект, проверка на передачу пустого интерфейса
|
||||
// проверка самой работы осуществляется через FileCodePageDetect()
|
||||
func TestCodePageDetect(t *testing.T) {
|
||||
tmp, err := CodePageDetect(nil)
|
||||
assert.Nil(t, err, fmt.Sprintf("<CodePageDetect> on input nil return error != nil\n"))
|
||||
assert.Equal(t, tmp, ASCII, fmt.Sprintf("<CodePageDetect> on input nil return code page != ASCII\n"))
|
||||
|
||||
tmp, err = CodePageDetect(nil, "~")
|
||||
tmp, err := CodepageDetect(nil)
|
||||
assert.Nil(t, err, fmt.Sprintf("<CodePageDetect> on input nil return error != nil\n"))
|
||||
assert.Equal(t, tmp, ASCII, fmt.Sprintf("<CodePageDetect> on input nil return code page != ASCII\n"))
|
||||
|
||||
var data *os.File
|
||||
res, err := CodePageDetect(data)
|
||||
res, err := CodepageDetect(data)
|
||||
assert.NotNil(t, err, fmt.Sprintf("<CodePageDetect> on empty io.Reader return error != nil, data: %+v, err: %v\n", data, err))
|
||||
assert.Equal(t, res, ASCII, fmt.Sprintf("<CodePageDetect> on empty io.Reader = %+v return code page %s != ASCII\n", data, res))
|
||||
|
||||
res, err = CodePageDetect(strings.NewReader(""))
|
||||
res, err = CodepageDetect(strings.NewReader(""))
|
||||
assert.Nil(t, err, fmt.Sprintf("<CodePageDetect> on input \"\" return error: %v, expected nil\n", err))
|
||||
assert.Equal(t, res, UTF8, fmt.Sprintf("<CodePageDetect> on input \"\" return %s, expected ASCII\n", res))
|
||||
}
|
||||
|
||||
//FileConvertCodePage
|
||||
func TestFileConvertCodePage(t *testing.T) {
|
||||
err := FileConvertCodePage("", CP866, CP1251)
|
||||
err := FileConvertCodepage("", CP866, CP1251)
|
||||
assert.NotNil(t, err, fmt.Sprintf("<FileConvertCodePage> on empty file name expected error, got: %v", err))
|
||||
|
||||
err = FileConvertCodePage("", CP866, CP866)
|
||||
err = FileConvertCodepage("", CP866, CP866)
|
||||
assert.Nil(t, err, fmt.Sprintf("<FileConvertCodePage> on fromCp == toCp expected error==nil, got: %v", err))
|
||||
|
||||
err = FileConvertCodePage("123", UTF8, CP866)
|
||||
err = FileConvertCodepage("123", UTF8, CP866)
|
||||
assert.Nil(t, err, fmt.Sprintf("<FileConvertCodePage> on fromCp or toCp not Windows1251 or IBM866 expected error == nil, got: %v", err))
|
||||
|
||||
err = FileConvertCodePage("123", CP866, UTF16LE)
|
||||
err = FileConvertCodepage("123", CP866, UTF16LE)
|
||||
assert.Nil(t, err, fmt.Sprintf("<FileConvertCodePage> on fromCp or toCp not Windows1251 or IBM866 expected error == nil, got: %v", err))
|
||||
|
||||
err = FileConvertCodePage(fp.Join("test_files/rune_encode_error.txt"), CP866, CP1251)
|
||||
err = FileConvertCodepage(fp.Join("test_files/rune_encode_error.txt"), CP866, CP1251)
|
||||
assert.NotNil(t, err, fmt.Sprintf("<FileConvertCodePage> expected error, got: %v", err))
|
||||
|
||||
os.Link(fp.Join("test_files/866to1251.txt"), fp.Join("test_files/866to1251.tmp"))
|
||||
err = FileConvertCodePage(fp.Join("test_files/866to1251.tmp"), CP866, CP1251)
|
||||
err = FileConvertCodepage(fp.Join("test_files/866to1251.tmp"), CP866, CP1251)
|
||||
assert.Nil(t, err, fmt.Sprintf("<FileConvertCodePage> expect no err, got: %v", err))
|
||||
os.Remove(fp.Join("test_files/866to1251.tmp"))
|
||||
}
|
||||
|
||||
//ConvertCodePage
|
||||
func TestStrConvertCodePage(t *testing.T) {
|
||||
s, err := StrConvertCodePage(string([]byte{0x91, 0xE2, 0xE0}), CP866, CP1251)
|
||||
s, err := StrConvertCodepage(string([]byte{0x91, 0xE2, 0xE0}), CP866, CP1251)
|
||||
assert.Nil(t, err, fmt.Sprintf("<StrConvertCodePage> on test 1 return err: %v unexpected nil", err))
|
||||
assert.Equal(t, s, string([]byte{0xD1, 0xF2, 0xF0}), fmt.Sprintf("<StrConvertCodePage> on test CP866->CP1251 return string: %s, expected: 'Стр'", s))
|
||||
|
||||
s, err = StrConvertCodePage(string([]byte{0xFF, 0xC9, 0xB8}), CP1251, CP866)
|
||||
s, err = StrConvertCodepage(string([]byte{0xFF, 0xC9, 0xB8}), CP1251, CP866)
|
||||
assert.Nil(t, err, fmt.Sprintf("<StrConvertCodePage> on test CP1251->CP866 return unexpected err: %v", err))
|
||||
assert.Equal(t, s, string([]byte{0xEF, 0x89, 0xF1}), fmt.Sprintf("<StrConvertCodePage> on test CP1251->CP866 return string: %s, expected: 'яЙё'", s))
|
||||
|
||||
s, err = StrConvertCodePage("", CP866, CP1251)
|
||||
s, err = StrConvertCodepage("", CP866, CP1251)
|
||||
assert.Nil(t, err, fmt.Sprintf("<StrConvertCodePage> with empty input string must return ERROR nil, but return: %v", err))
|
||||
assert.Equal(t, s, "", fmt.Sprintf("<StrConvertCodePage> with empty input string must return empty, but return: %s", err))
|
||||
|
||||
s, err = StrConvertCodePage("1234", CP866, CP866)
|
||||
s, err = StrConvertCodepage("1234", CP866, CP866)
|
||||
assert.Nil(t, err, fmt.Sprintf("<StrConvertCodePage> with equal fromCP and toCp must return nil, but retrurn: %v", err))
|
||||
assert.Equal(t, s, "1234", fmt.Sprintf("<StrConvertCodePage> with equal fromCP and toCp must return input string, but return: %s", s))
|
||||
|
||||
s, err = StrConvertCodePage(string([]byte{0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), CP1251, UTF8)
|
||||
s, err = StrConvertCodepage(string([]byte{0xD0, 0xEE, 0xF1, 0xF1, 0xE8, 0xFF}), CP1251, UTF8)
|
||||
assert.Nil(t, err, fmt.Sprintf("<StrConvertCodePage> from CP1251 to UTF expect nil, got: %v", err))
|
||||
assert.Equal(t, s, "Россия", fmt.Sprintf("<StrConvertCodePage> '%s' wrong return from CP1251 to UTF string", s))
|
||||
|
||||
s, err = StrConvertCodePage(string([]byte{0x90, 0xAE, 0xE1, 0xE1, 0xA8, 0xEF}), CP866, UTF8)
|
||||
s, err = StrConvertCodepage(string([]byte{0x90, 0xAE, 0xE1, 0xE1, 0xA8, 0xEF}), CP866, UTF8)
|
||||
assert.Nil(t, err, fmt.Sprintf("<StrConvertCodePage> from CP866 to UTF expect nil, got: %v", err))
|
||||
assert.Equal(t, s, "Россия", fmt.Sprintf("<StrConvertCodePage> '%s' wrong return from CP866 to UTF string", s))
|
||||
}
|
||||
|
|
13
linter.md
13
linter.md
|
@ -1,6 +1,13 @@
|
|||
cpd_test.go:166:9: Error return value of `os.Link` is not checked (errcheck)
|
||||
cpTable.go:33:19: func `(*cpTable).sort` is unused (unused)
|
||||
utf8.go:113:6: func `toUTF8` is unused (unused)
|
||||
code_pages.go:171:24: func `TCodepagesDic.clear` is unused (unused)
|
||||
cpTable.go:27:19: func `(*cpTable).clear` is unused (unused)
|
||||
sample\main.go:16:14: Error return value of `FindFilesExt` is not checked (errcheck)
|
||||
FindFilesExt(&fl, ".\\", os.Args[1])
|
||||
^
|
||||
cpd_test.go:183:9: Error return value of `os.Link` is not checked (errcheck)
|
||||
os.Link(fp.Join("test_files/866to1251.txt"), fp.Join("test_files/866to1251.tmp"))
|
||||
^
|
||||
cpd_test.go:69:2: `st` is unused (structcheck)
|
||||
st string //stop string
|
||||
cpd_test.go:70:2: `st` is unused (structcheck)
|
||||
st string //stop string, not using now
|
||||
^
|
||||
|
|
|
@ -15,7 +15,7 @@ func main() {
|
|||
var fl []string
|
||||
FindFilesExt(&fl, ".\\", os.Args[1])
|
||||
for _, fn := range fl {
|
||||
t, _ := cpd.FileCodePageDetect(fn)
|
||||
t, _ := cpd.FileCodepageDetect(fn)
|
||||
fmt.Printf("file: \t`%s`\t`%s`\n", fn, t)
|
||||
}
|
||||
}
|
||||
|
|
Двоичный файл не отображается.
Двоичный файл не отображается.
|
@ -2,8 +2,6 @@ package cpd
|
|||
|
||||
import (
|
||||
"encoding/binary"
|
||||
|
||||
"github.com/softlandia/xlib"
|
||||
)
|
||||
|
||||
//unit for UTF16BE
|
||||
|
@ -17,7 +15,7 @@ func matchUTF16be(b []byte, tbl *cpTable) MatchRes {
|
|||
//первый количество найденных русских букв
|
||||
//второй количество найденных 0x00
|
||||
//решающим является максимальный
|
||||
return MatchRes{xlib.Max(matchUTF16beRu(b, tbl), matchUTF16beZerro(b)), 0}
|
||||
return MatchRes{max(matchUTF16beRu(b, tbl), matchUTF16beZerro(b)), 0}
|
||||
}
|
||||
|
||||
// matchUTF16leZerro - вычисляет критерий по количеству нулевых байтов, текст набранный латинскими символами в кодировке UTF16le будет вторым символом иметь 0x00
|
||||
|
|
26
utf16le.go
26
utf16le.go
|
@ -2,27 +2,25 @@ package cpd
|
|||
|
||||
import (
|
||||
"encoding/binary"
|
||||
|
||||
"github.com/softlandia/xlib"
|
||||
)
|
||||
|
||||
//unit for UTF16LE
|
||||
//проверка на BOM уже выполнена, в принимаемом массиве нет BOM символов
|
||||
|
||||
//русские буквы в UTF16 имеют уникальные номера
|
||||
//определять кодировку UTF16 (как LE так и BE) нужно по внутреннему устройству, не по кодам русских букв
|
||||
//проверка на BOM уже выполнена, в принимаемом массиве не BOM символов
|
||||
|
||||
// matchUTF16le - функция вычисляет общий критерий для кодировки UTF16LE
|
||||
// два критерия используется
|
||||
// первый количество найденных русских букв
|
||||
// второй количество найденных 0x00
|
||||
// решающим является максимальный
|
||||
func matchUTF16le(b []byte, tbl *cpTable) MatchRes {
|
||||
n := len(b)/2 - 1
|
||||
if n <= 0 {
|
||||
return MatchRes{0, 0}
|
||||
}
|
||||
//два критерия используется
|
||||
//первый количество найденных русских букв
|
||||
//второй количество найденных 0x00
|
||||
//решающим является максимальный
|
||||
return MatchRes{xlib.Max(matchUTF16leRu(b, tbl), matchUTF16leZerro(b)), 0}
|
||||
return MatchRes{max(matchUTF16leRu(b, tbl), matchUTF16leZerro(b)), 0}
|
||||
}
|
||||
|
||||
// matchUTF16leZerro - вычисляет критерий по количеству нулевых байтов, текст набранный латинскими символами в кодировке UTF16le будет вторым символом иметь 0x00
|
||||
|
@ -64,18 +62,6 @@ func matchUTF16leRu(b []byte, tbl *cpTable) int {
|
|||
return matches
|
||||
}
|
||||
|
||||
/*func matchUTF16leFirstGreateSecond(b []byte) int {
|
||||
count := 0
|
||||
n := len(b)/2 - 1
|
||||
for i := 0; i < n; i += 2 {
|
||||
//second byte of russian char is 0x04
|
||||
if b[i] > b[i+1] {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}*/
|
||||
|
||||
const (
|
||||
cpUTF16leBeginUpperChar = 0x1004
|
||||
cpUTF16leStopUpperChar = 0x2F04
|
||||
|
|
5
utf8.go
5
utf8.go
|
@ -108,3 +108,8 @@ func isLowerUTF8(r rune) bool {
|
|||
func isUTF8(r rune) bool {
|
||||
return isUpperUTF8(r) || isLowerUTF8(r)
|
||||
}
|
||||
|
||||
//TODO: not implemented yet, returns the input unchanged
|
||||
func toUTF8(s string) string {
|
||||
return s
|
||||
}
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
package cpd
|
||||
|
||||
import "sort"
|
||||
|
||||
// max - return the larger of two ints
|
||||
func max(x, y int) int {
|
||||
if x > y {
|
||||
return x
|
||||
}
|
||||
return y
|
||||
}
|
||||
|
||||
// sortBytes - sort the slice of bytes in place (ascending) and return it
|
||||
func sortBytes(b []byte) []byte {
|
||||
sort.Slice(b, func(i, j int) bool { return b[i] < b[j] })
|
||||
return b
|
||||
}
|
Загрузка…
Ссылка в новой задаче