This commit is contained in:
softlandia 2020-01-10 02:09:09 +04:00
Родитель 9c2bbb0422
Коммит 5166d2704f
7 изменённых файлов: 38 добавлений и 127 удалений

9
cpd.go
Просмотреть файл

@ -7,7 +7,6 @@ import (
"fmt"
"io"
"os"
"reflect"
"unicode"
"golang.org/x/text/encoding/charmap"
@ -35,13 +34,13 @@ func FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error) {
//CodePageDetect - detect code page of ascii data from reader 'r'
func CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error) {
//test input interface
if !reflect.ValueOf(r).IsValid() {
return ASCII, fmt.Errorf("input reader is nil")
if r == nil {
return ASCII, nil
}
//make slice of byte from input reader
buf, err := bufio.NewReader(r).Peek(ReadBufSize)
if (err != nil) && (err.Error() != "EOF") {
//if (err != nil) && (err.Error() != "EOF") {
if (err != nil) && (err != io.EOF) {
return ASCII, err
}

Просмотреть файл

@ -35,28 +35,28 @@ type tFileCodePageDetectTest struct {
}
// dFileCodePageDetect is the fixture table for the FileCodePageDetect test:
// one row per file under test_files, ending with the code page the detector
// is expected to report for it.
//
// Rows are positional tFileCodePageDetectTest literals; presumably the fields
// are file name, stop string, expected error and expected code page — confirm
// against the struct declaration.
//
// Each fixture is listed exactly once. A duplicated row would run the same file
// twice, and two rows with different expectations for the same file (as was the
// case for rune_encode_error.txt: ASCII vs ISOLatinCyrillic) can never both pass.
var dFileCodePageDetect = []tFileCodePageDetectTest{
	{"test_files\\866&1251.txt", "", nil, CP1251},                     // file contains more CP1251 than CP866 text
	{"test_files\\empty_file.txt", "", nil, UTF8},                     // file exists but is empty: no error, UTF8 expected
	{"test_files\\IBM866.txt", "", nil, CP866},                        // file contains IBM866
	{"test_files\\ISO8859-5.txt", "", nil, ISOLatinCyrillic},          // file contains ISO8859-5
	{"test_files\\KOI8-r.txt", "", nil, KOI8R},                        // file contains KOI8
	{"test_files\\KOI8-r2.txt", "", nil, KOI8R},                       // file contains KOI8
	{"test_files\\noCodePage.txt", "", nil, UTF8},                     // file contains only ASCII runes, UTF8 expected
	{"test_files\\rune_encode_error.txt", "", nil, ISOLatinCyrillic}, // file contains a special rune -> encode error, but detection itself returns no error
	{"test_files\\rune_error_1251.txt", "", nil, CP1251},              // file contains CP1251 and a special rune -> encode error, but detection itself returns no error
	{"test_files\\utf8.txt", "", nil, UTF8},                           // file contains UTF-8 without a BOM at the start
	{"test_files\\utf8-wbom.txt", "", nil, UTF8},                      // file contains UTF-8 with a BOM prefix
	{"test_files\\utf8-woBOM.txt", "", nil, UTF8},                     // file contains UTF-8 without a BOM at the start
	{"test_files\\utf16be-wBOM.txt", "", nil, UTF16BE},                // file contains UTF-16 big endian with BOM
	{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE},               // file contains UTF-16 big endian without BOM
	{"test_files\\utf16le-wBOM.txt", "", nil, UTF16LE},                // file contains UTF-16 little endian with BOM
	{"test_files\\utf16le-woBOM.txt", "", nil, UTF16LE},               // file contains UTF-16 little endian without BOM
	{"test_files\\utf32be-wBOM.txt", "", nil, UTF32BE},                // file contains UTF-32 big endian with BOM
	{"test_files\\utf32be-woBOM.txt", "", nil, UTF32BE},               // file contains UTF-32 big endian without BOM
	{"test_files\\utf32le-wBOM.txt", "", nil, UTF32LE},                // file contains UTF-32 little endian with BOM
	{"test_files\\utf32le-woBOM.txt", "", nil, UTF32LE},               // file contains UTF-32 little endian without BOM
	{"test_files\\Win1251.txt", "", nil, CP1251},                      // file contains Windows-1251
	{"test_files\\win1251_upper.txt", "", nil, CP1251},                // file contains Windows-1251
}
//FileCodePageDetect
@ -84,7 +84,6 @@ func TestFileCodePageDetect(t *testing.T) {
if err == nil {
t.Errorf("<FileCodePageDetect> on file '' must return error, but return nil")
}
}
//TestCodePageDetect - тестирование метода CodePageDetect
@ -92,15 +91,15 @@ func TestFileCodePageDetect(t *testing.T) {
// 1. nil входящий поток явный nil, параметр останова отсутствует
// 2. nil, "~" входящий поток явный nil, параметр останова присутствует
// 3. входящий поток не инициализированный объект, проверка на передачу пустого интерфейса
// проверка работы осуществляется через FileCodePageDetect()
// проверка самой работы осуществляется через FileCodePageDetect()
func TestCodePageDetect(t *testing.T) {
_, err := CodePageDetect(nil)
if err == nil {
t.Errorf("<CodePageDetect> on input nil return error == nil, expect error != nil\n")
tmp, err := CodePageDetect(nil)
if (err != nil) && (tmp != ASCII) {
t.Errorf("<CodePageDetect> on input nil return error != nil or code page != ASCII\n")
}
_, err = CodePageDetect(nil, "~")
if err == nil {
t.Errorf("<CodePageDetect> on input nil return error == nil, expect error != nil\n")
tmp, err = CodePageDetect(nil, "~")
if (err != nil) && (tmp != ASCII) {
t.Errorf("<CodePageDetect> on input nil return error != nil or code page != ASCII\n")
}
var data *os.File

Просмотреть файл

@ -16,7 +16,7 @@ func match866(data []byte, tbl *codePageTable) MatchRes {
}
const (
cp866BeginUpperChar = 0x80
cp866StartUpperChar = 0x80
cp866StopUpperChar = 0x9F
cp866BeginLowerChar1 = 0xA0
cp866StopLowerChar1 = 0xAF
@ -25,7 +25,7 @@ const (
)
func isUpper866(r byte) bool {
return (r >= cp866BeginUpperChar) && (r <= cp866StopUpperChar)
return (r >= cp866StartUpperChar) && (r <= cp866StopUpperChar)
}
func isLower866(r byte) bool {

Просмотреть файл

@ -1,7 +1,5 @@
package cpd
import "unicode"
//unit for ISO-8859-5
func matchISO88595(d []byte, tbl *codePageTable) MatchRes {
@ -21,31 +19,6 @@ func matchISO88595(d []byte, tbl *codePageTable) MatchRes {
return MatchRes{tbl.founded(), 0}
}
// runesMatchISO88595_2 scans data from its third byte onward and counts the
// bytes that look like part of ISO-8859-5 text, judging each byte by the one
// or two bytes that precede it.
//
// For every counted byte the matching tbl entry's count is incremented as well.
// A byte is counted only when tbl.index reports an index greater than zero for it.
//
// Returns the total number of bytes counted. Inputs shorter than three bytes
// yield zero.
func runesMatchISO88595_2(data []byte, tbl *codePageTable) (counts int) {
	for pos := 2; pos < len(data); pos++ {
		before := rune(data[pos-2])
		prev := rune(data[pos-1])
		cur := rune(data[pos])

		// Case "<punct><UPPER><cur>": a punctuation byte followed by an upper-case letter.
		if unicode.IsPunct(before) && isUpperISO88595(prev) {
			if idx := tbl.index(cur); idx > 0 {
				(*tbl)[idx].count++
				counts++
				continue
			}
		}

		// Fallback: the current byte directly follows an ISO-8859-5 byte.
		if !isISO88595(prev) {
			continue
		}
		if idx := tbl.index(cur); idx > 0 {
			(*tbl)[idx].count++
			counts++
		}
	}
	return counts
}
const (
cpISO88595BeginUpperChar = 0xB0
cpISO88595StopUpperChar = 0xCF
@ -61,11 +34,11 @@ func lu88595(r byte) (res int) {
}
func isUpperISO88595(r rune) bool {
return (r >= cpKOI8BeginUpperChar) && (r <= cpKOI8StopUpperChar)
return (r >= cpISO88595BeginUpperChar) && (r <= cpISO88595StopUpperChar)
}
func isLowerISO88595(r rune) bool {
return (r >= cpKOI8BeginLowerChar) && (r <= cpKOI8StopLowerChar)
return (r >= cpISO88595BeginLowerChar) && (r <= cpISO88595StopLowerChar)
}
func isISO88595(r rune) bool {

27
koi8.go
Просмотреть файл

@ -1,7 +1,5 @@
package cpd
import "unicode"
//unit for koi-8
var consonansKOI8 = [256]byte{
@ -77,31 +75,6 @@ func matchRuneKOI8(d []byte, tbl *codePageTable) int {
return tbl.founded()
}
// runesMatchKOI8_2 scans data from its third byte onward and counts the bytes
// that look like part of KOI8 text, judging each byte by the one or two bytes
// that precede it.
//
// For every counted byte the matching tbl entry's count is incremented as well.
// A byte is counted only when tbl.index reports an index greater than zero for it.
//
// Returns the total number of bytes counted. Inputs shorter than three bytes
// yield zero.
func runesMatchKOI8_2(data []byte, tbl *codePageTable) (counts int) {
	for n := 2; n < len(data); n++ {
		lead, prev := data[n-2], data[n-1]
		cur := rune(data[n])

		// Case "<punct><UPPER><cur>": a punctuation byte followed by an upper-case letter.
		if unicode.IsPunct(rune(lead)) && isUpperKOI8(prev) {
			if slot := tbl.index(cur); slot > 0 {
				(*tbl)[slot].count++
				counts++
				continue
			}
		}

		// Fallback: the current byte directly follows a KOI8 byte.
		if !isKOI8(prev) {
			continue
		}
		if slot := tbl.index(cur); slot > 0 {
			(*tbl)[slot].count++
			counts++
		}
	}
	return counts
}
const (
cpKOI8BeginUpperChar = 0xE0
cpKOI8StopUpperChar = 0xFF

Просмотреть файл

@ -87,10 +87,7 @@ func ValidUTF8(data []byte) bool {
return false
}
}
if float64(zerroByteCount)/float64(m) > 0.05 {
return false
}
return true
return float64(zerroByteCount)/float64(m) < 0.05
}
const (

Просмотреть файл

@ -77,36 +77,6 @@ func match1251(d []byte, tbl *codePageTable) MatchRes {
return MatchRes{matchRune1251(d, tbl), cvPairs1251(d)}
}
// runesMatch1251_1 scans d from its second byte onward and counts the bytes
// that look like part of Windows-1251 text, judging each byte by the bytes
// that precede it.
//
// For every counted byte the matching tbl entry's count is incremented as well.
// Two situations are recognised:
//
//   - "ab": the previous byte satisfies is1251 and tbl.index reports an index
//     greater than zero for the current byte;
//   - " Us": separator, upper-case byte, then a current byte that both has an
//     index greater than zero and satisfies isLower1251.
//
// NOTE(review): the second case is only reached when is1251 rejected the
// previous byte; whether isUpper1251 can still accept such a byte depends on
// definitions outside this block — confirm the branch is reachable.
//
// Returns the total number of bytes counted.
func runesMatch1251_1(d []byte, tbl *codePageTable) (counts int) {
	for pos := 1; pos < len(d); pos++ {
		prev, cur := d[pos-1], d[pos]

		// Case "ab": symbols arranged in pairs.
		if is1251(prev) {
			if idx := tbl.index(rune(cur)); idx > 0 {
				(*tbl)[idx].count++
				counts++
			}
			continue
		}

		// The next case looks two bytes back, so it needs at least three bytes.
		if pos < 2 {
			continue
		}

		// Case " Us": separator_UPPER_lower.
		if !IsSeparator(rune(d[pos-2])) || !isUpper1251(prev) {
			continue
		}
		if idx := tbl.index(rune(cur)); idx > 0 && isLower1251(cur) {
			(*tbl)[idx].count++
			counts++
		}
	}
	return counts
}
const (
cp1251BeginUpperChar = 0xC0
cp1251StopUpperChar = 0xDF