зеркало из https://github.com/softlandia/cpd.git
v0.2.0
This commit is contained in:
Родитель
54bfa0d068
Коммит
4387137abc
|
@ -1,3 +1,4 @@
|
|||
*.zip
|
||||
*.7z
|
||||
.idea/*
|
||||
.idea/*
|
||||
tmp*
|
|
@ -0,0 +1,17 @@
|
|||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Launch",
|
||||
"type": "go",
|
||||
"request": "launch",
|
||||
"mode": "auto",
|
||||
"program": "${fileDirname}",
|
||||
"env": {},
|
||||
"args": []
|
||||
}
|
||||
]
|
||||
}
|
Двоичный файл не отображается.
|
@ -0,0 +1,23 @@
|
|||
package cpd
|
||||
|
||||
//codePageTable
|
||||
|
||||
//return index of rune in code page table
|
||||
//return 0 if rune not in code page table
|
||||
func (t *codePageTable) containsRune(r rune) int {
|
||||
for j, e := range *t {
|
||||
if r == e.code {
|
||||
return j
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func (t *codePageTable) isUpper(r rune) bool {
|
||||
for i := 10; i < len(t); i++ {
|
||||
if r == (*t)[i].code {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
// file from "golang.org/x/text/encoding/internal/identifier" (c) golang authors
|
||||
// contain identifier of code page
|
||||
// IDCodePage implements interface String()
|
||||
|
||||
package cpd
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
//IDCodePage - index of code page
|
||||
type IDCodePage uint16
|
||||
|
||||
func (i IDCodePage) String() string {
|
||||
return codePageName[i]
|
||||
}
|
||||
|
||||
// itRuneMatch is the signature of a per-rune matcher: it returns 1 if r
// belongs to the code page described by tbl and 0 otherwise.
type itRuneMatch func(r rune, tbl *codePageTable) int

// runesMatch is the signature of a per-buffer matcher: it returns how many
// elements of data were counted as belonging to the code page described by tbl.
type runesMatch func(data []byte, tbl *codePageTable) int
|
||||
|
||||
// tableElement is one entry of a codePageTable.
type tableElement struct {
	code  rune // the rune of interest; it is present in this code page as a letter of the alphabet
	count int  // number of occurrences of this rune
}
|
||||
|
||||
// codePageTable holds the main (most frequently occurring) letters of the
// alphabet in a given encoding.
// In the tables declared in CodePages element 0 is an "absent" marker,
// elements 1..9 are lower-case letters and elements 10..18 are the matching
// upper-case letters.
type codePageTable [19]tableElement
|
||||
|
||||
// MatchRes is the final criterion of how well a data buffer matches a code page.
type MatchRes struct {
	countMatch int // number of matches counted for the code page
}
|
||||
|
||||
// CodePage holds the data for one specific code page.
type CodePage struct {
	id       IDCodePage    // id of the code page
	name     string        // name of the code page
	MatchRes               // embedded match counter (countMatch)
	match    runesMatch    // counts how many entries of the input data belong to this code page
	table    codePageTable // main alphabet runes of this code page, used to calculate frequency
}
|
||||
|
||||
func (o CodePage) String() string {
|
||||
return fmt.Sprintf("id: %s, countMatch: %d", o.id, o.countMatch)
|
||||
}
|
||||
|
||||
//MatchingRunes - return string with rune/counts
|
||||
func (o CodePage) MatchingRunes() string {
|
||||
var sb strings.Builder
|
||||
fmt.Fprint(&sb, "rune/counts: ")
|
||||
for i, e := range o.table {
|
||||
if i != 0 {
|
||||
fmt.Fprintf(&sb, "%x/%d, ", e.code, e.count)
|
||||
}
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
// TCodePages is the type used to store all code pages.
type TCodePages []CodePage

// DeepMach is currently a stub: it does not inspect data and always
// returns ASCII.
func (o *TCodePages) DeepMach(data []byte) IDCodePage {
	return ASCII
}
|
||||
|
||||
//Match - return IDCodePage
|
||||
//simple calculate count entry data runes in standart code page table
|
||||
func (o TCodePages) Match(data []byte) (result IDCodePage) {
|
||||
result = ASCII
|
||||
maxCount := 0
|
||||
for i, cp := range o {
|
||||
o[i].countMatch = cp.match(data, &o[i].table)
|
||||
if o[i].countMatch > maxCount {
|
||||
maxCount = o[i].countMatch
|
||||
result = cp.id
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
//CodePages - slice of code pages
|
||||
var CodePages = TCodePages{
|
||||
{ASCII, "ASCII", MatchRes{0}, runesMatchASCII,
|
||||
codePageTable{{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}},
|
||||
{IBM866, "IBM866", MatchRes{0}, runesMatch866,
|
||||
codePageTable{
|
||||
//first element serves as sign of absence
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0xAE, 0}, {0xA5, 0}, {0xA0, 0}, {0xA8, 0}, {0xAD, 0}, {0xE2, 0}, {0xE1, 0}, {0xE0, 0}, {0xA2, 0},
|
||||
{0x8E, 0}, {0x85, 0}, {0x80, 0}, {0x88, 0}, {0x8D, 0}, {0x92, 0}, {0x91, 0}, {0x90, 0}, {0x82, 0}}},
|
||||
{UTF8, "UTF8", MatchRes{0}, runesMatchUTF8,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0xD0BE, 0}, {0xD0B5, 0}, {0xD0B0, 0}, {0xD0B8, 0}, {0xD0BD, 0}, {0xD182, 0}, {0xD181, 0}, {0xD180, 0}, {0xD0B2, 0},
|
||||
{0xD09E, 0}, {0xD095, 0}, {0xD090, 0}, {0xD098, 0}, {0xD0AD, 0}, {0xD0A2, 0}, {0xD0A1, 0}, {0xD0A0, 0}, {0xD092, 0}}},
|
||||
{Windows1251, "Windows1251", MatchRes{0}, runesMatch1251,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//а и н с р в л к в
|
||||
{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xE2, 0},
|
||||
{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xC2, 0}}},
|
||||
{KOI8R, "KOI8R", MatchRes{0}, runesMatchKOI8,
|
||||
codePageTable{
|
||||
//о а и т с в л к в
|
||||
{0, 0},
|
||||
{0xCF, 0}, {0xC1, 0}, {0xC9, 0}, {0xD4, 0}, {0xD3, 0}, {0xD7, 0}, {0xCC, 0}, {0xCB, 0}, {0xD7, 0},
|
||||
{0xEF, 0}, {0xE1, 0}, {0xE9, 0}, {0xF4, 0}, {0xF3, 0}, {0xF7, 0}, {0xEC, 0}, {0xEB, 0}, {0xF7, 0}}},
|
||||
}
|
||||
|
||||
//codePageName - string of code page name
|
||||
var codePageName = map[IDCodePage]string{
|
||||
ASCII: "ASCII",
|
||||
IBM866: "IBM866",
|
||||
Windows1251: "Windows1251",
|
||||
UTF8: "UTF8",
|
||||
UTF16: "UTF16",
|
||||
UTF16LE: "UTF16LE",
|
||||
UTF16BE: "UTF16BE",
|
||||
UTF32: "UTF32",
|
||||
KOI8R: "KOI8R",
|
||||
ISO5427Cyrillic: "ISO5427Cyrillic",
|
||||
ISO51INISCyrillic: "ISO51INISCyrillic",
|
||||
ISO111ECMACyrillic: "ISO111ECMACyrillic",
|
||||
ISO153GOST1976874: "ISO153GOST1976874",
|
||||
Unicode: "Unicode",
|
||||
}
|
|
@ -0,0 +1,111 @@
|
|||
package cpd
|
||||
|
||||
// Code page identifiers. Per the comment on each constant, the numeric
// value is the uint16 identifier that goes with the given IANA name.
const (
	// ASCII is the uint16 identifier with IANA name US-ASCII (MIME: US-ASCII).
	// ANSI X3.4-1986
	// Reference: RFC2046
	ASCII IDCodePage = 3

	// ISO5427Cyrillic is the uint16 identifier with IANA name ISO_5427.
	// ISO-IR: International Register of Escape Sequences
	// Note: The current registration authority is IPSJ/ITSCJ, Japan.
	// Reference: RFC1345
	ISO5427Cyrillic IDCodePage = 48

	// ISO51INISCyrillic is the uint16 identifier with IANA name INIS-cyrillic.
	// ISO-IR: International Register of Escape Sequences
	// Note: The current registration authority is IPSJ/ITSCJ, Japan.
	// Reference: RFC1345
	ISO51INISCyrillic IDCodePage = 53

	// ISO111ECMACyrillic is the uint16 identifier with IANA name ECMA-cyrillic.
	// ISO registry
	// (formerly ECMA registry )
	ISO111ECMACyrillic IDCodePage = 77

	// ISO153GOST1976874 is the uint16 identifier with IANA name GOST_19768-74.
	// ISO-IR: International Register of Escape Sequences
	// Note: The current registration authority is IPSJ/ITSCJ, Japan.
	// Reference: RFC1345
	ISO153GOST1976874 IDCodePage = 94

	// UTF8 is the uint16 identifier with IANA name UTF-8.
	//
	// rfc3629
	// Reference: RFC3629
	UTF8 IDCodePage = 106

	// Unicode is the uint16 identifier with IANA name ISO-10646-UCS-2.
	//
	// the 2-octet Basic Multilingual Plane, aka Unicode
	// this needs to specify network byte order: the standard
	// does not specify (it is a 16-bit integer space)
	Unicode IDCodePage = 1000

	// UnicodeASCII is the uint16 identifier with IANA name ISO-10646-UCS-Basic.
	//
	// ASCII subset of Unicode. Basic Latin = collection 1
	// See ISO 10646, Appendix A
	UnicodeASCII IDCodePage = 1002

	// UTF7 is the uint16 identifier with IANA name UTF-7.
	//
	// rfc2152
	// Reference: RFC2152
	UTF7 IDCodePage = 1012

	// UTF16BE is the uint16 identifier with IANA name UTF-16BE.
	//
	// rfc2781
	// Reference: RFC2781
	UTF16BE IDCodePage = 1013

	// UTF16LE is the uint16 identifier with IANA name UTF-16LE.
	//
	// rfc2781
	// Reference: RFC2781
	UTF16LE IDCodePage = 1014

	// UTF16 is the uint16 identifier with IANA name UTF-16.
	//
	// rfc2781
	// Reference: RFC2781
	UTF16 IDCodePage = 1015

	// UTF32 is the uint16 identifier with IANA name UTF-32.
	//
	// https://www.unicode.org/unicode/reports/tr19/
	UTF32 IDCodePage = 1017

	// UTF32BE is the uint16 identifier with IANA name UTF-32BE.
	//
	// https://www.unicode.org/unicode/reports/tr19/
	UTF32BE IDCodePage = 1018

	// UTF32LE is the uint16 identifier with IANA name UTF-32LE.
	//
	// https://www.unicode.org/unicode/reports/tr19/
	UTF32LE IDCodePage = 1019

	// KOI8R is the uint16 identifier with IANA name KOI8-R (MIME: KOI8-R).
	//
	// rfc1489 , based on GOST-19768-74, ISO-6937/8,
	// INIS-Cyrillic, ISO-5427.
	// Reference: RFC1489
	KOI8R IDCodePage = 2084

	// IBM866 is the uint16 identifier with IANA name IBM866.
	//
	// IBM NLDG Volume 2 (SE09-8002-03) August 1994
	IBM866 IDCodePage = 2086

	// Windows1251 is the uint16 identifier with IANA name windows-1251.
	//
	// Microsoft http://www.iana.org/assignments/charset-reg/windows-1251
	Windows1251 IDCodePage = 2251

	// Windows1252 is the uint16 identifier with IANA name windows-1252.
	//
	// Microsoft http://www.iana.org/assignments/charset-reg/windows-1252
	Windows1252 IDCodePage = 2252
)
|
15
const.go
15
const.go
|
@ -1,15 +0,0 @@
|
|||
package cpd
|
||||
|
||||
import (
|
||||
"github.com/softlandia/cpd/internal/cp"
|
||||
)
|
||||
|
||||
//numbers of code page
|
||||
const (
|
||||
CpASCII = cp.ASCII
|
||||
CpWindows1251 = cp.Windows1251
|
||||
CpIBM866 = cp.IBM866
|
||||
CpUTF8 = cp.UTF8
|
||||
CpUTF16 = cp.UTF16
|
||||
CpUTF32 = cp.UTF32
|
||||
)
|
|
@ -0,0 +1,45 @@
|
|||
package cpd
|
||||
|
||||
//checkHeader - check buffer for match to utf-8, utf-16le or utf-16be BOM
|
||||
func checkHeader(b []byte) (id IDCodePage, res bool) {
|
||||
if bomUTF8(b) {
|
||||
return UTF8, true
|
||||
}
|
||||
if bomUTF16le(b) {
|
||||
return UTF16LE, true
|
||||
}
|
||||
if bomUTF16be(b) {
|
||||
return UTF16BE, true
|
||||
}
|
||||
return ASCII, false
|
||||
}
|
||||
|
||||
// bomUTF8 reports whether b starts with the bytes EF BB BF (the UTF-8 BOM).
// Buffers shorter than three bytes never match.
func bomUTF8(b []byte) bool {
	return len(b) >= 3 && b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF
}
|
||||
|
||||
// bomUTF16le reports whether b starts with the bytes FF FE (the UTF-16
// little-endian BOM). Buffers shorter than two bytes never match.
func bomUTF16le(b []byte) bool {
	return len(b) >= 2 && b[0] == 0xFF && b[1] == 0xFE
}
|
||||
|
||||
// bomUTF16be reports whether b starts with the bytes FE FF (the UTF-16
// big-endian BOM). Buffers shorter than two bytes never match.
func bomUTF16be(b []byte) bool {
	return len(b) >= 2 && b[0] == 0xFE && b[1] == 0xFF
}
|
||||
|
||||
//ASCII block

// itASCII is the per-rune matcher for ASCII. It ignores both arguments and
// always returns 0.
func itASCII(r rune, tbl *codePageTable) int {
	return 0
}
|
||||
|
||||
// runesMatchASCII is the buffer matcher for ASCII. It ignores both
// arguments and always returns 0.
func runesMatchASCII(b []byte, tbl *codePageTable) int {
	return 0
}
|
152
cpd.go
152
cpd.go
|
@ -7,90 +7,57 @@ package cpd
|
|||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"reflect"
|
||||
|
||||
"github.com/softlandia/cpd/internal/cp"
|
||||
"golang.org/x/text/encoding/charmap"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
//StrConvertCodePage - convert string from one code page to another
|
||||
func StrConvertCodePage(s string, fromCP, toCP uint16) (string, error) {
|
||||
if len(s) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
if fromCP == toCP {
|
||||
return s, nil
|
||||
}
|
||||
|
||||
var err error
|
||||
|
||||
switch fromCP {
|
||||
case cp.IBM866:
|
||||
s, _, err = transform.String(charmap.CodePage866.NewDecoder(), s)
|
||||
case cp.Windows1251:
|
||||
s, _, err = transform.String(charmap.Windows1251.NewDecoder(), s)
|
||||
}
|
||||
switch toCP {
|
||||
case cp.IBM866:
|
||||
s, _, err = transform.String(charmap.CodePage866.NewEncoder(), s)
|
||||
case cp.Windows1251:
|
||||
s, _, err = transform.String(charmap.Windows1251.NewEncoder(), s)
|
||||
}
|
||||
return s, err
|
||||
//CodePageAutoDetect - auto detect code page of input content
|
||||
func CodePageAutoDetect(content []byte) (result IDCodePage) {
|
||||
return CodePages.Match(content)
|
||||
}
|
||||
|
||||
// CodePageAsString - return name of char set with id codepage
|
||||
// if codepage not exist - return ""
|
||||
func CodePageAsString(codepage uint16) string {
|
||||
return cp.Name[codepage]
|
||||
//CodePageDetect - detect code page of ascii data from reader 'r'
|
||||
func CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error) {
|
||||
//initial test
|
||||
//test input interfase
|
||||
if !reflect.ValueOf(r).IsValid() {
|
||||
return ASCII, fmt.Errorf("input reader is nil")
|
||||
}
|
||||
|
||||
//make slice of byte from input reader
|
||||
buf, err := bufio.NewReader(r).Peek(1024)
|
||||
if (err != nil) && (err.Error() != "EOF") {
|
||||
return ASCII, err
|
||||
}
|
||||
|
||||
//check file header // utf-8, utf-16 with BOM
|
||||
if idHeader, ok := checkHeader(buf); ok {
|
||||
return idHeader, nil
|
||||
}
|
||||
return CodePageAutoDetect(buf), nil
|
||||
}
|
||||
|
||||
//CodePageDetect - detect code page of file
|
||||
//return 0 if code page can not be detected
|
||||
//return const cpd.CpWindows1251 for Windows code page 1251
|
||||
//return const cdp.CpIBM866 for IBM 866 code page
|
||||
//return conts cdp.CpASCII by default or on error
|
||||
//EF-BB-BF utf8 bom
|
||||
func CodePageDetect(fn string, stopStr ...string) (uint16, error) {
|
||||
var (
|
||||
count1251 int //счётчик символов в кодировке 1251
|
||||
count866 int //счётчик символов в кодировке 866
|
||||
)
|
||||
//FileCodePageDetect - detect code page of text file
|
||||
func FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error) {
|
||||
|
||||
iFile, err := os.Open(fn)
|
||||
if err != nil {
|
||||
return CpASCII, err
|
||||
return ASCII, err
|
||||
}
|
||||
defer iFile.Close()
|
||||
|
||||
iScanner := bufio.NewScanner(iFile)
|
||||
for i := 0; iScanner.Scan(); i++ {
|
||||
s := iScanner.Text()
|
||||
if (len(stopStr) > 0) && strings.Contains(s, stopStr[0]) { //stopStr[0] - строка при обнаружении которой останавливаемся, stopStr - слайс строк
|
||||
break
|
||||
}
|
||||
for j := range s {
|
||||
if isRune1251(rune(s[j])) { //проверка принадлежности символа позициям алфавитных символов в кодовой таблице 1251
|
||||
count1251++
|
||||
}
|
||||
if isRune866(rune(s[j])) { //проверка принадлежности символа позициям алфавитных символов в кодовой таблице 866
|
||||
count866++
|
||||
}
|
||||
}
|
||||
if len(stopStr) > 0 {
|
||||
return CodePageDetect(iFile, stopStr[0])
|
||||
}
|
||||
switch {
|
||||
case count1251 > count866:
|
||||
return CpWindows1251, nil
|
||||
case count1251 < count866:
|
||||
return CpIBM866, nil
|
||||
}
|
||||
return CpASCII, nil
|
||||
return CodePageDetect(iFile)
|
||||
}
|
||||
|
||||
//FileConvertCodePage - replace code page text file from one to another
|
||||
func FileConvertCodePage(fileName string, fromCP, toCP uint16) error {
|
||||
func FileConvertCodePage(fileName string, fromCP, toCP IDCodePage) error {
|
||||
if fromCP == toCP {
|
||||
return nil
|
||||
}
|
||||
|
@ -126,37 +93,34 @@ func FileConvertCodePage(fileName string, fromCP, toCP uint16) error {
|
|||
return os.Rename(tmpFileName, fileName)
|
||||
}
|
||||
|
||||
const (
|
||||
cp866r1Min = 0x80 //заглавная буква А
|
||||
cp866r1Max = 0xAF //строчная буква п - в этом интервале в 866 раскладке лежит большинство русских букв
|
||||
cp866r2Min = 0xE0 //строчная р
|
||||
cp866r2Max = 0xF1 //строчна ё - в этом интервале лежат остальные русские буквы
|
||||
cp1251s1 = 0xA8 //Ё
|
||||
cp1251s2 = 0xB8 //ё в этой позиции в 866 лежит псевдографика
|
||||
cp1251r1Min = 0xC0 //с этой позиции начинается весь алфавит
|
||||
cp1251r1Max = 0xFF //заканчивается
|
||||
cpKOI8RMin = 0xC0 //начало интервала
|
||||
cpKOI8RMax = 0xFF //конец интервала
|
||||
)
|
||||
|
||||
func isRune1251(r rune) bool {
|
||||
switch {
|
||||
case r == cp1251s1:
|
||||
return true
|
||||
case r == cp1251s2:
|
||||
return true
|
||||
case (r >= cp1251r1Min) && (r <= cp1251r1Max):
|
||||
return true
|
||||
//StrConvertCodePage - convert string from one code page to another
|
||||
func StrConvertCodePage(s string, fromCP, toCP IDCodePage) (string, error) {
|
||||
if len(s) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
return false
|
||||
if fromCP == toCP {
|
||||
return s, nil
|
||||
}
|
||||
|
||||
var err error
|
||||
|
||||
switch fromCP {
|
||||
case IBM866:
|
||||
s, _, err = transform.String(charmap.CodePage866.NewDecoder(), s)
|
||||
case Windows1251:
|
||||
s, _, err = transform.String(charmap.Windows1251.NewDecoder(), s)
|
||||
}
|
||||
switch toCP {
|
||||
case IBM866:
|
||||
s, _, err = transform.String(charmap.CodePage866.NewEncoder(), s)
|
||||
case Windows1251:
|
||||
s, _, err = transform.String(charmap.Windows1251.NewEncoder(), s)
|
||||
}
|
||||
return s, err
|
||||
}
|
||||
|
||||
func isRune866(r rune) bool {
|
||||
switch {
|
||||
case (r >= cp866r1Min) && (r <= cp866r1Max):
|
||||
return true
|
||||
case (r >= cp866r2Min) && (r <= cp866r2Max):
|
||||
return true
|
||||
}
|
||||
return false
|
||||
// CodePageAsString - return name of char set with id codepage
|
||||
// if codepage not exist - return ""
|
||||
func CodePageAsString(codepage IDCodePage) string {
|
||||
return codePageName[codepage]
|
||||
}
|
||||
|
|
143
cpd_test.go
143
cpd_test.go
|
@ -3,20 +3,18 @@
|
|||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/softlandia/cpd/internal/cp"
|
||||
)
|
||||
|
||||
type tCodePageAsString struct {
|
||||
id uint16
|
||||
id IDCodePage
|
||||
s string
|
||||
}
|
||||
|
||||
var dCodePageAsString = []tCodePageAsString{
|
||||
{0, ""},
|
||||
{3, "ASCII"},
|
||||
{cp.IBM866, "IBM866"},
|
||||
{cp.Windows1251, "Windows1251"},
|
||||
{IBM866, "IBM866"},
|
||||
{Windows1251, "Windows1251"},
|
||||
{60000, ""},
|
||||
}
|
||||
|
||||
|
@ -29,44 +27,107 @@ func TestCodePageAsString(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
//CodePageDetect
|
||||
//TestCodePageDetect - тестирование метода CodePageDetect
|
||||
// проверки на входные параметры:
|
||||
// 1. nil входящий поток явный nil, параметр останова отсутствует
|
||||
// 2. nil, "~" входящий поток явный nil, параметр останова присутствует
|
||||
// 3. входящий поток не инициализированный объект, проверка на передачу пустого интерфейса
|
||||
// проверка работы осуществляется через FileCodePageDetect()
|
||||
func TestCodePageDetect(t *testing.T) {
|
||||
res, err := CodePageDetect("test_files\\866&1251.txt", "~X~") //befor ~X~ file contain 866, after 1251
|
||||
if err != nil {
|
||||
t.Errorf("<CodePageDetect> on file '%s' return error: %v", "866&1251.txt", err)
|
||||
}
|
||||
if res != cp.IBM866 {
|
||||
t.Errorf("<CodePageDetect> on file '%s' expected 866 got: %s", "866&1251.txt", CodePageAsString(res))
|
||||
}
|
||||
|
||||
res, err = CodePageDetect("test_files\\866&1251.txt") //file contain more 1251 then 866
|
||||
if res != cp.Windows1251 {
|
||||
t.Errorf("<CodePageDetect> on file '%s' expected 1251 got: %s", "866&1251.txt", CodePageAsString(res))
|
||||
}
|
||||
|
||||
_, err = CodePageDetect("-.-") //file "-.-" not exist
|
||||
_, err := CodePageDetect(nil)
|
||||
if err == nil {
|
||||
t.Errorf("<CodePageDetect> on file '-.-' must return error, but return nil")
|
||||
t.Errorf("<CodePageDetect> on input nil return error == nil, expect error != nil\n")
|
||||
}
|
||||
_, err = CodePageDetect(nil, "~")
|
||||
if err == nil {
|
||||
t.Errorf("<CodePageDetect> on input nil return error == nil, expect error != nil\n")
|
||||
}
|
||||
|
||||
res, _ = CodePageDetect("test_files\\noCodePage.txt") //file contain rune only ASCII
|
||||
if res != cp.ASCII {
|
||||
t.Errorf("<CodePageDetect> on file 'noCodePage.txt' expect ASCII got: %s", CodePageAsString(res))
|
||||
var data *os.File
|
||||
res, err := CodePageDetect(data, "~")
|
||||
if err == nil {
|
||||
t.Errorf("<CodePageDetect> on input nil return error != nil, data: %+v, res: %d, code page: %s\n", data, res, CodePageAsString(res))
|
||||
}
|
||||
}
|
||||
|
||||
func TestFileCodePageDetectSimple(t *testing.T) {
|
||||
res, err := FileCodePageDetect("test_files\\866to1251.txt")
|
||||
if err != nil {
|
||||
t.Errorf("<FileCodePageDetect()> on file '866to1251.txt' err expected: nil, got: %s\n", err)
|
||||
}
|
||||
if res != IBM866 {
|
||||
t.Errorf("<FileCodePageDetect()> on file '866to1251.txt' expected: %s, got: %s\n", IBM866, res)
|
||||
}
|
||||
res, err = FileCodePageDetect("test_files\\866&1251.txt")
|
||||
if err != nil {
|
||||
t.Errorf("<FileCodePageDetect()> on file '866&1251.txt' err expected: nil, got: %s\n", err)
|
||||
}
|
||||
if res != Windows1251 {
|
||||
t.Errorf("<FileCodePageDetect()> on file '866&1251.txt' expected: %s, got: %s\n", Windows1251, res)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFileCodePageDetectUtf8Bom(t *testing.T) {
|
||||
res, err := FileCodePageDetect("test_files\\utf8wbom.txt")
|
||||
if err != nil {
|
||||
t.Errorf("<FileCodePageDetect()> on file 'utf8wbom.txt' err expected: nil, got: %s\n", err)
|
||||
}
|
||||
if res != UTF8 {
|
||||
t.Errorf("<FileCodePageDetect()> on file 'utf8wbom.txt' expected: %s, got: %s\n", UTF8, res)
|
||||
}
|
||||
}
|
||||
|
||||
type tFileCodePageDetectTest struct {
|
||||
fn string //filename
|
||||
st string //stop string
|
||||
e error //
|
||||
r IDCodePage //expected result
|
||||
}
|
||||
|
||||
var dFileCodePageDetect = []tFileCodePageDetectTest{
|
||||
{"test_files\\utf16BEwbom.txt", "", nil, UTF16BE}, //file contain utf16 big endian with bom rune at start
|
||||
{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE}, //file contain utf16 big endian with out bom rune at start
|
||||
{"test_files\\utf16le-wBOM.txt", "", nil, UTF16LE}, //file contain utf16 liitle endian with bom rune at start
|
||||
{"test_files\\utf16le-woBOM.txt", "", nil, UTF16LE}, //file contain utf16 liitle endian with out bom rune at start
|
||||
{"test_files\\utf8-woBOM.txt", "", nil, UTF8}, //file contain utf8 with out bom rune at start
|
||||
{"test_files\\866&1251.txt", "~X~", nil, Windows1251}, //befor ~X~ file contain 866, after 1251
|
||||
{"test_files\\866&1251.txt", "", nil, Windows1251}, //file contain more 1251 then 866
|
||||
{"test_files\\noCodePage.txt", "", nil, ASCII}, //file contain rune only ASCII
|
||||
{"test_files\\empty_file.txt", "", nil, ASCII}, //file exist but empty, no error, return ASCII
|
||||
{"test_files\\rune_encode_error.txt", "", nil, ASCII}, //file contain special rune -> encode error, but detect NO error
|
||||
{"test_files\\rune_error_1251.txt", "", nil, Windows1251}, //file contain 1251 and special rune -> encode error, but detect NO error
|
||||
{"test_files\\utf8wbom.txt", "", nil, UTF8}, //file contain utf8 with bom rune at start
|
||||
{"test_files\\utf16LEwbom.txt", "", nil, UTF16LE}, //file contain utf16 little endian with bom rune at start
|
||||
}
|
||||
|
||||
//FileCodePageDetect
|
||||
func TestFileCodePageDetect(t *testing.T) {
|
||||
var (
|
||||
err error
|
||||
res IDCodePage
|
||||
)
|
||||
for _, d := range dFileCodePageDetect {
|
||||
if len(d.st) == 0 {
|
||||
res, err = FileCodePageDetect(d.fn)
|
||||
} else {
|
||||
res, err = FileCodePageDetect(d.fn, d.st)
|
||||
}
|
||||
if err != d.e {
|
||||
t.Errorf("<FileCodePageDetect> on file '%s' expected error: '%v', got: '%v', ", d.fn, d.e, err)
|
||||
}
|
||||
if res != d.r {
|
||||
t.Errorf("<FileCodePageDetect> on file '%s' expected result: %s, got: %s", d.fn, d.r, res)
|
||||
}
|
||||
}
|
||||
|
||||
res, err = CodePageDetect("test_files\\empty_file.txt")
|
||||
if (res != cp.ASCII) || (err != nil) {
|
||||
t.Errorf("<CodePageDetect> on file 'empty_file.txt' expect ASCII and no error got: %s and %v", CodePageAsString(res), err)
|
||||
_, err = FileCodePageDetect("-.-") //file "-.-" not exist
|
||||
if err == nil {
|
||||
t.Errorf("<FileCodePageDetect> on file '-.-' must return error, but return nil")
|
||||
}
|
||||
|
||||
res, err = CodePageDetect("test_files\\rune_encode_error.txt")
|
||||
if (res != cp.ASCII) || (err != nil) {
|
||||
t.Errorf("<CodePageDetect> on file 'rune_encode_error.txt' expect ASCII and no error got: %s and %v", CodePageAsString(res), err)
|
||||
}
|
||||
|
||||
res, err = CodePageDetect("test_files\\rune_error_1251.txt")
|
||||
if res != cp.Windows1251 {
|
||||
t.Errorf("<CodePageDetect> on file 'rune_error_1251.txt' expect 1251 and no error got: %s and %v", CodePageAsString(res), err)
|
||||
_, err = FileCodePageDetect("") //file "" not exist
|
||||
if err == nil {
|
||||
t.Errorf("<FileCodePageDetect> on file '' must return error, but return nil")
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -83,13 +144,13 @@ func TestFileConvertCodePage(t *testing.T) {
|
|||
t.Errorf("<FileConvertCodePage> on fromCp == toCp expected error==nil, got: %v", err)
|
||||
}
|
||||
|
||||
err = FileConvertCodePage("test_files\\rune_encode_error.txt", cp.IBM866, cp.Windows1251)
|
||||
err = FileConvertCodePage("test_files\\rune_encode_error.txt", IBM866, Windows1251)
|
||||
if err == nil {
|
||||
t.Errorf("<FileConvertCodePage> expected error, got: %v", err)
|
||||
}
|
||||
|
||||
os.Link("test_files\\866to1251.txt", "test_files\\866to1251.tmp")
|
||||
err = FileConvertCodePage("test_files\\866to1251.tmp", cp.IBM866, cp.Windows1251)
|
||||
err = FileConvertCodePage("test_files\\866to1251.tmp", IBM866, Windows1251)
|
||||
if err != nil {
|
||||
t.Errorf("<FileConvertCodePage> expect no err, got: %v", err)
|
||||
}
|
||||
|
@ -98,19 +159,19 @@ func TestFileConvertCodePage(t *testing.T) {
|
|||
|
||||
//ConvertCodePage
|
||||
func TestStrConvertCodePage(t *testing.T) {
|
||||
_, err := StrConvertCodePage("1234", cp.IBM866, cp.Windows1251)
|
||||
_, err := StrConvertCodePage("1234", IBM866, Windows1251)
|
||||
if err != nil {
|
||||
t.Errorf("<StrConvertCodePage> on test 1 return unexpected err: %v", err)
|
||||
}
|
||||
_, err = StrConvertCodePage("1234", cp.Windows1251, cp.IBM866)
|
||||
_, err = StrConvertCodePage("1234", Windows1251, IBM866)
|
||||
if err != nil {
|
||||
t.Errorf("<StrConvertCodePage> on test 2 return unexpected err: %v", err)
|
||||
}
|
||||
_, err = StrConvertCodePage("", cp.IBM866, cp.Windows1251)
|
||||
_, err = StrConvertCodePage("", IBM866, Windows1251)
|
||||
if err != nil {
|
||||
t.Errorf("<StrConvertCodePage> with empty string must return ERROR, but retrurn: %v", err)
|
||||
}
|
||||
_, err = StrConvertCodePage("1234", cp.IBM866, cp.IBM866)
|
||||
_, err = StrConvertCodePage("1234", IBM866, IBM866)
|
||||
if err != nil {
|
||||
t.Errorf("<StrConvertCodePage> with equal fromCP and toCp must return nil, but retrurn: %v", err)
|
||||
}
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
package cpd
|
||||
|
||||
//unit for ibm866
|
||||
|
||||
func runesMatch866(data []byte, tbl *codePageTable) (counts int) {
|
||||
for i := range data {
|
||||
if i == 0 {
|
||||
continue
|
||||
}
|
||||
if tbl.containsRune(rune(data[i-1])) > 0 {
|
||||
j := tbl.containsRune(rune(data[i]))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
1643
internal/cp/mib.go
1643
internal/cp/mib.go
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,49 @@
|
|||
package cpd
|
||||
|
||||
import "unicode"
|
||||
|
||||
//unit for koi-8
|
||||
|
||||
func runesMatchKOI8(data []byte, tbl *codePageTable) (counts int) {
|
||||
for i := range data {
|
||||
if i < 2 {
|
||||
continue
|
||||
}
|
||||
//case " Us" - separator_UPPER_symbol
|
||||
if unicode.IsPunct(rune(data[i-2])) && isUpperKOI8(rune(data[i-1])) {
|
||||
j := tbl.containsRune(rune(data[i]))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
continue
|
||||
}
|
||||
}
|
||||
if isKOI8(rune(data[i-1])) {
|
||||
j := tbl.containsRune(rune(data[i]))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Byte ranges used by the KOI8 helpers below.
const (
	cpKOI8StartUpperChar = 0xE0
	cpKOI8StopUpperChar  = 0xFF
	cpKOI8StartLowerChar = 0xC0
	cpKOI8StopLowerChar  = 0xDF
)

// isUpperKOI8 reports whether r lies in the range
// cpKOI8StartUpperChar..cpKOI8StopUpperChar (inclusive).
func isUpperKOI8(r rune) bool {
	return cpKOI8StartUpperChar <= r && r <= cpKOI8StopUpperChar
}

// isLowerKOI8 reports whether r lies in the range
// cpKOI8StartLowerChar..cpKOI8StopLowerChar (inclusive).
func isLowerKOI8(r rune) bool {
	return cpKOI8StartLowerChar <= r && r <= cpKOI8StopLowerChar
}

// isKOI8 reports whether r lies in either of the two ranges above.
func isKOI8(r rune) bool {
	if isLowerKOI8(r) {
		return true
	}
	return isUpperKOI8(r)
}
|
|
@ -0,0 +1 @@
|
|||
<EFBFBD>ãá᪨© ¢ ª®¤¨à®¢ª¥ IBM866
|
|
@ -0,0 +1 @@
|
|||
Русский в кодировке KOI8r
|
|
@ -0,0 +1 @@
|
|||
Русский в кодировке Windows1251
|
|
@ -1 +1 @@
|
|||
<EFBFBD>
|
||||
0
|
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
|
@ -0,0 +1 @@
|
|||
Русский в кодировке UTF8
|
|
@ -0,0 +1 @@
|
|||
Русский в кодировке UTF8
|
|
@ -0,0 +1 @@
|
|||
Utf8 w/o bom Русский
|
|
@ -0,0 +1,2 @@
|
|||
code page UTF8
|
||||
Русский
|
|
@ -0,0 +1,22 @@
|
|||
package cpd
|
||||
|
||||
import "encoding/binary"
|
||||
|
||||
//unit for UTF8
|
||||
|
||||
func runesMatchUTF8(data []byte, tbl *codePageTable) (counts int) {
|
||||
n := len(data)/2 - 1
|
||||
if n <= 0 {
|
||||
return
|
||||
}
|
||||
for i := 0; i < n; i += 2 {
|
||||
t := data[i : i+2]
|
||||
d := binary.BigEndian.Uint16(t)
|
||||
j := tbl.containsRune(rune(d))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
package cpd
|
||||
|
||||
import "unicode"
|
||||
|
||||
//unit for windows1251
|
||||
|
||||
//TODO: нужно отличить от KOI-8r
|
||||
func runesMatch1251(data []byte, tbl *codePageTable) (counts int) {
|
||||
for i := range data {
|
||||
if i < 2 {
|
||||
continue
|
||||
}
|
||||
//case " Us" - separator_UPPER_symbol
|
||||
if unicode.IsPunct(rune(data[i-2])) && isUpper1251(rune(data[i-1])) {
|
||||
j := tbl.containsRune(rune(data[i]))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
continue
|
||||
}
|
||||
}
|
||||
//case "ab" - counts only if symbols are arranged in pairs
|
||||
if is1251(rune(data[i-1])) {
|
||||
j := tbl.containsRune(rune(data[i]))
|
||||
if j > 0 {
|
||||
(*tbl)[j].count++
|
||||
counts++
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Byte ranges used by the Windows-1251 helpers below.
const (
	cp1251StartUpperChar = 0xC0
	cp1251StopUpperChar  = 0xDF
	cp1251StartLowerChar = 0xE0
	cp1251StopLowerChar  = 0xFF
)

// isUpper1251 reports whether r lies in the range
// cp1251StartUpperChar..cp1251StopUpperChar (inclusive).
func isUpper1251(r rune) bool {
	return cp1251StartUpperChar <= r && r <= cp1251StopUpperChar
}

// isLower1251 reports whether r lies in the range
// cp1251StartLowerChar..cp1251StopLowerChar (inclusive).
func isLower1251(r rune) bool {
	return cp1251StartLowerChar <= r && r <= cp1251StopLowerChar
}

// is1251 reports whether r lies in either of the two ranges above.
func is1251(r rune) bool {
	if isUpper1251(r) {
		return true
	}
	return isLower1251(r)
}
|
Двоичные данные
частотность букв.xlsx
Двоичные данные
частотность букв.xlsx
Двоичный файл не отображается.
Загрузка…
Ссылка в новой задаче