зеркало из https://github.com/softlandia/cpd.git
v0.3.1 exclude reflect
This commit is contained in:
Родитель
9c2bbb0422
Коммит
5166d2704f
9
cpd.go
9
cpd.go
|
@ -7,7 +7,6 @@ import (
|
|||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"reflect"
|
||||
"unicode"
|
||||
|
||||
"golang.org/x/text/encoding/charmap"
|
||||
|
@ -35,13 +34,13 @@ func FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error) {
|
|||
//CodePageDetect - detect code page of ascii data from reader 'r'
|
||||
func CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error) {
|
||||
//test input interfase
|
||||
if !reflect.ValueOf(r).IsValid() {
|
||||
return ASCII, fmt.Errorf("input reader is nil")
|
||||
if r == nil {
|
||||
return ASCII, nil
|
||||
}
|
||||
|
||||
//make slice of byte from input reader
|
||||
buf, err := bufio.NewReader(r).Peek(ReadBufSize)
|
||||
if (err != nil) && (err.Error() != "EOF") {
|
||||
//if (err != nil) && (err.Error() != "EOF") {
|
||||
if (err != nil) && (err != io.EOF) {
|
||||
return ASCII, err
|
||||
}
|
||||
|
||||
|
|
59
cpd_test.go
59
cpd_test.go
|
@ -35,28 +35,28 @@ type tFileCodePageDetectTest struct {
|
|||
}
|
||||
|
||||
var dFileCodePageDetect = []tFileCodePageDetectTest{
|
||||
{"test_files\\866&1251.txt", "", nil, CP1251}, //file contain more 1251 then 866
|
||||
{"test_files\\empty_file.txt", "", nil, UTF8}, //file exist but empty, no error, return ASCII
|
||||
{"test_files\\IBM866.txt", "", nil, CP866}, //file contain IBM866
|
||||
{"test_files\\ISO8859-5.txt", "", nil, ISOLatinCyrillic}, //file contain ISO8859-5
|
||||
{"test_files\\KOI8-r.txt", "", nil, KOI8R}, //file contain KOI8
|
||||
{"test_files\\KOI8-r2.txt", "", nil, KOI8R}, //file contain KOI8
|
||||
{"test_files\\noCodePage.txt", "", nil, UTF8}, //file contain rune only ASCII
|
||||
{"test_files\\rune_encode_error.txt", "", nil, ASCII}, //file contain special rune -> encode error, but detect NO error
|
||||
{"test_files\\rune_error_1251.txt", "", nil, CP1251}, //file contain 1251 and special rune -> encode error, but detect NO error
|
||||
{"test_files\\utf8.txt", "", nil, UTF8}, //file contain utf8 with out bom rune at start
|
||||
{"test_files\\utf8-wbom.txt", "", nil, UTF8}, //file contain utf8 with bom prefix
|
||||
{"test_files\\utf8-woBOM.txt", "", nil, UTF8}, //file contain utf8 with out bom rune at start
|
||||
{"test_files\\utf16be-wBOM.txt", "", nil, UTF16BE}, //file contain utf16 big endian with bom
|
||||
{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE}, //file contain utf16 big endian without bom
|
||||
{"test_files\\utf16le-wBOM.txt", "", nil, UTF16LE}, //file contain utf16 little endian with bom
|
||||
{"test_files\\utf16le-woBOM.txt", "", nil, UTF16LE}, //file contain utf16 little endian without bom
|
||||
{"test_files\\utf32be-wBOM.txt", "", nil, UTF32BE}, //file contain utf32 big endian with bom
|
||||
{"test_files\\utf32be-woBOM.txt", "", nil, UTF32BE}, //file contain utf32 big endian without bom
|
||||
{"test_files\\utf32le-wBOM.txt", "", nil, UTF32LE}, //file contain utf32 little endian with bom
|
||||
{"test_files\\utf32le-woBOM.txt", "", nil, UTF32LE}, //file contain utf32 little endian without bom
|
||||
{"test_files\\Win1251.txt", "", nil, CP1251}, //file contain Windows1251
|
||||
{"test_files\\win1251_upper.txt", "", nil, CP1251}, //file contain Windows1251
|
||||
{"test_files\\866&1251.txt", "", nil, CP1251}, //file contain more 1251 then 866
|
||||
{"test_files\\empty_file.txt", "", nil, UTF8}, //file exist but empty, no error, return ASCII
|
||||
{"test_files\\IBM866.txt", "", nil, CP866}, //file contain IBM866
|
||||
{"test_files\\ISO8859-5.txt", "", nil, ISOLatinCyrillic}, //file contain ISO8859-5
|
||||
{"test_files\\KOI8-r.txt", "", nil, KOI8R}, //file contain KOI8
|
||||
{"test_files\\KOI8-r2.txt", "", nil, KOI8R}, //file contain KOI8
|
||||
{"test_files\\noCodePage.txt", "", nil, UTF8}, //file contain rune only ASCII
|
||||
{"test_files\\rune_encode_error.txt", "", nil, ISOLatinCyrillic}, //file contain special rune -> encode error, but detect NO error
|
||||
{"test_files\\rune_error_1251.txt", "", nil, CP1251}, //file contain 1251 and special rune -> encode error, but detect NO error
|
||||
{"test_files\\utf8.txt", "", nil, UTF8}, //file contain utf8 with out bom rune at start
|
||||
{"test_files\\utf8-wbom.txt", "", nil, UTF8}, //file contain utf8 with bom prefix
|
||||
{"test_files\\utf8-woBOM.txt", "", nil, UTF8}, //file contain utf8 with out bom rune at start
|
||||
{"test_files\\utf16be-wBOM.txt", "", nil, UTF16BE}, //file contain utf16 big endian with bom
|
||||
{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE}, //file contain utf16 big endian without bom
|
||||
{"test_files\\utf16le-wBOM.txt", "", nil, UTF16LE}, //file contain utf16 little endian with bom
|
||||
{"test_files\\utf16le-woBOM.txt", "", nil, UTF16LE}, //file contain utf16 little endian without bom
|
||||
{"test_files\\utf32be-wBOM.txt", "", nil, UTF32BE}, //file contain utf32 big endian with bom
|
||||
{"test_files\\utf32be-woBOM.txt", "", nil, UTF32BE}, //file contain utf32 big endian without bom
|
||||
{"test_files\\utf32le-wBOM.txt", "", nil, UTF32LE}, //file contain utf32 little endian with bom
|
||||
{"test_files\\utf32le-woBOM.txt", "", nil, UTF32LE}, //file contain utf32 little endian without bom
|
||||
{"test_files\\Win1251.txt", "", nil, CP1251}, //file contain Windows1251
|
||||
{"test_files\\win1251_upper.txt", "", nil, CP1251}, //file contain Windows1251
|
||||
}
|
||||
|
||||
//FileCodePageDetect
|
||||
|
@ -84,7 +84,6 @@ func TestFileCodePageDetect(t *testing.T) {
|
|||
if err == nil {
|
||||
t.Errorf("<FileCodePageDetect> on file '' must return error, but return nil")
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//TestCodePageDetect - тестирование метода CodePageDetect
|
||||
|
@ -92,15 +91,15 @@ func TestFileCodePageDetect(t *testing.T) {
|
|||
// 1. nil входящий поток явный nil, параметр останова отсутствует
|
||||
// 2. nil, "~" входящий поток явный nil, параметр останова присутствует
|
||||
// 3. входящий поток не инициализированный объект, проверка на передачу пустого интерфейса
|
||||
// проверка работы осуществляется через FileCodePageDetect()
|
||||
// проверка самой работы осуществляется через FileCodePageDetect()
|
||||
func TestCodePageDetect(t *testing.T) {
|
||||
_, err := CodePageDetect(nil)
|
||||
if err == nil {
|
||||
t.Errorf("<CodePageDetect> on input nil return error == nil, expect error != nil\n")
|
||||
tmp, err := CodePageDetect(nil)
|
||||
if (err != nil) && (tmp != ASCII) {
|
||||
t.Errorf("<CodePageDetect> on input nil return error != nil or code page != ASCII\n")
|
||||
}
|
||||
_, err = CodePageDetect(nil, "~")
|
||||
if err == nil {
|
||||
t.Errorf("<CodePageDetect> on input nil return error == nil, expect error != nil\n")
|
||||
tmp, err = CodePageDetect(nil, "~")
|
||||
if (err != nil) && (tmp != ASCII) {
|
||||
t.Errorf("<CodePageDetect> on input nil return error != nil or code page != ASCII\n")
|
||||
}
|
||||
|
||||
var data *os.File
|
||||
|
|
|
@ -16,7 +16,7 @@ func match866(data []byte, tbl *codePageTable) MatchRes {
|
|||
}
|
||||
|
||||
const (
|
||||
cp866BeginUpperChar = 0x80
|
||||
cp866StartUpperChar = 0x80
|
||||
cp866StopUpperChar = 0x9F
|
||||
cp866BeginLowerChar1 = 0xA0
|
||||
cp866StopLowerChar1 = 0xAF
|
||||
|
@ -25,7 +25,7 @@ const (
|
|||
)
|
||||
|
||||
func isUpper866(r byte) bool {
|
||||
return (r >= cp866BeginUpperChar) && (r <= cp866StopUpperChar)
|
||||
return (r >= cp866StartUpperChar) && (r <= cp866StopUpperChar)
|
||||
}
|
||||
|
||||
func isLower866(r byte) bool {
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
package cpd
|
||||
|
||||
import "unicode"
|
||||
|
||||
//unit for ISO-8859-5
|
||||
|
||||
func matchISO88595(d []byte, tbl *codePageTable) MatchRes {
|
||||
|
@ -21,31 +19,6 @@ func matchISO88595(d []byte, tbl *codePageTable) MatchRes {
|
|||
return MatchRes{tbl.founded(), 0}
|
||||
}
|
||||
|
||||
func runesMatchISO88595_2(data []byte, tbl *codePageTable) (counts int) {
|
||||
for i := range data {
|
||||
if i < 2 {
|
||||
continue
|
||||
}
|
||||
//case " Us" - separator_UPPER_symbol
|
||||
if unicode.IsPunct(rune(data[i-2])) && isUpperISO88595(rune(data[i-1])) {
|
||||
j := tbl.index(rune(data[i]))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
continue
|
||||
}
|
||||
}
|
||||
if isISO88595(rune(data[i-1])) {
|
||||
j := tbl.index(rune(data[i]))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
const (
|
||||
cpISO88595BeginUpperChar = 0xB0
|
||||
cpISO88595StopUpperChar = 0xCF
|
||||
|
@ -61,11 +34,11 @@ func lu88595(r byte) (res int) {
|
|||
}
|
||||
|
||||
func isUpperISO88595(r rune) bool {
|
||||
return (r >= cpKOI8BeginUpperChar) && (r <= cpKOI8StopUpperChar)
|
||||
return (r >= cpISO88595BeginUpperChar) && (r <= cpISO88595StopUpperChar)
|
||||
}
|
||||
|
||||
func isLowerISO88595(r rune) bool {
|
||||
return (r >= cpKOI8BeginLowerChar) && (r <= cpKOI8StopLowerChar)
|
||||
return (r >= cpISO88595BeginLowerChar) && (r <= cpISO88595StopLowerChar)
|
||||
}
|
||||
|
||||
func isISO88595(r rune) bool {
|
||||
|
|
27
koi8.go
27
koi8.go
|
@ -1,7 +1,5 @@
|
|||
package cpd
|
||||
|
||||
import "unicode"
|
||||
|
||||
//unit for koi-8
|
||||
|
||||
var consonansKOI8 = [256]byte{
|
||||
|
@ -77,31 +75,6 @@ func matchRuneKOI8(d []byte, tbl *codePageTable) int {
|
|||
return tbl.founded()
|
||||
}
|
||||
|
||||
func runesMatchKOI8_2(data []byte, tbl *codePageTable) (counts int) {
|
||||
for i := range data {
|
||||
if i < 2 {
|
||||
continue
|
||||
}
|
||||
//case " Us" - separator_UPPER_symbol
|
||||
if unicode.IsPunct(rune(data[i-2])) && isUpperKOI8(data[i-1]) {
|
||||
j := tbl.index(rune(data[i]))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
continue
|
||||
}
|
||||
}
|
||||
if isKOI8(data[i-1]) {
|
||||
j := tbl.index(rune(data[i]))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
const (
|
||||
cpKOI8BeginUpperChar = 0xE0
|
||||
cpKOI8StopUpperChar = 0xFF
|
||||
|
|
5
utf8.go
5
utf8.go
|
@ -87,10 +87,7 @@ func ValidUTF8(data []byte) bool {
|
|||
return false
|
||||
}
|
||||
}
|
||||
if float64(zerroByteCount)/float64(m) > 0.05 {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
return float64(zerroByteCount)/float64(m) < 0.05
|
||||
}
|
||||
|
||||
const (
|
||||
|
|
30
win1251.go
30
win1251.go
|
@ -77,36 +77,6 @@ func match1251(d []byte, tbl *codePageTable) MatchRes {
|
|||
return MatchRes{matchRune1251(d, tbl), cvPairs1251(d)}
|
||||
}
|
||||
|
||||
func runesMatch1251_1(d []byte, tbl *codePageTable) (counts int) {
|
||||
for i := range d {
|
||||
if i < 1 {
|
||||
continue
|
||||
}
|
||||
//case "ab" - counts only if symbols are arranged in pairs
|
||||
if is1251(d[i-1]) {
|
||||
j := tbl.index(rune(d[i]))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
}
|
||||
continue
|
||||
}
|
||||
if i < 2 {
|
||||
continue
|
||||
}
|
||||
//case " Us" separator_UPPER_lower
|
||||
if IsSeparator(rune(d[i-2])) && isUpper1251(d[i-1]) {
|
||||
j := tbl.index(rune(d[i]))
|
||||
if (j > 0) && (isLower1251(d[i])) {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
const (
|
||||
cp1251BeginUpperChar = 0xC0
|
||||
cp1251StopUpperChar = 0xDF
|
||||
|
|
Загрузка…
Ссылка в новой задаче