зеркало из https://github.com/softlandia/cpd.git
v0.2.2 add UTF32 with BOM
This commit is contained in:
Родитель
d5048a7a12
Коммит
fbe3be3cbc
|
@ -0,0 +1,46 @@
|
|||
package cpd
|
||||
|
||||
import "bytes"
|
||||
|
||||
// Boms - byte order marks: byte sequences at the start of data that identify its Unicode encoding
|
||||
var Boms = []struct {
|
||||
bom []byte
|
||||
id IDCodePage
|
||||
}{
|
||||
{[]byte{0xef, 0xbb, 0xbf}, UTF8},
|
||||
{[]byte{0x00, 0x00, 0xfe, 0xff}, UTF32BE},
|
||||
{[]byte{0xff, 0xfe, 0x00, 0x00}, UTF32LE},
|
||||
{[]byte{0xfe, 0xff}, UTF16BE},
|
||||
{[]byte{0xff, 0xfe}, UTF16LE},
|
||||
}
|
||||
|
||||
//CheckBOM - check buffer for match to utf-8, utf-16le or utf-16be BOM
|
||||
func CheckBOM(buf []byte) (id IDCodePage, res bool) {
|
||||
for _, b := range Boms {
|
||||
if bytes.HasPrefix(buf, b.bom) {
|
||||
return b.id, true
|
||||
}
|
||||
}
|
||||
return ASCII, false
|
||||
}
|
||||
|
||||
//bomUTF8 - return true if b starts with the UTF-8 byte order mark EF BB BF
// the mark is 3 bytes long, so a buffer that holds only the mark is accepted too
func bomUTF8(b []byte) bool {
	return len(b) >= 3 && b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF
}
|
||||
|
||||
//bomUTF16le - return true if b starts with the UTF-16 little endian byte order mark FF FE
// the mark is 2 bytes long, so a buffer that holds only the mark is accepted too
func bomUTF16le(b []byte) bool {
	return len(b) >= 2 && b[0] == 0xFF && b[1] == 0xFE
}
|
||||
|
||||
//bomUTF16be - return true if b starts with the UTF-16 big endian byte order mark FE FF
// the mark is 2 bytes long, so a buffer that holds only the mark is accepted too
func bomUTF16be(b []byte) bool {
	return len(b) >= 2 && b[0] == 0xFE && b[1] == 0xFF
}
|
||||
|
||||
//ASCII block
|
||||
//itASCII - rune matcher of the ASCII code page, has the signature of itRuneMatch
// both arguments are ignored, the result is always 0
func itASCII(r rune, tbl *codePageTable) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
//runesMatchASCII - matcher stored in the ASCII entry of CodepageDic
// both arguments are ignored, the reported match count is always 0
func runesMatchASCII(b []byte, tbl *codePageTable) int {
|
||||
return 0
|
||||
}
|
|
@ -0,0 +1,74 @@
|
|||
# code page detect #
|
||||
|
||||
(c) softlandia@gmail.com
|
||||
|
||||
>download: go get -u github.com/softlandia/cpd
|
||||
>install: go install
|
||||
|
||||
библиотека на golang
|
||||
|
||||
предназначена для автоматического определения кодовой страницы текстовых файлов или потоков байт
|
||||
поддерживает следующие кодовые страницы:
|
||||
|
||||
no ID Name
|
||||
|
||||
1. ASCII: "ASCII",
|
||||
2. ISOLatinCyrillic: "ISO-8859-5",
|
||||
3. CP866: "CP866",
|
||||
4. Windows1251: "Windows-1251",
|
||||
5. UTF8: "UTF-8",
|
||||
6. UTF16LE: "UTF-16LE",
|
||||
7. UTF16BE: "UTF-16BE",
|
||||
8. UTF32: "UTF-32",
|
||||
9. KOI8R: "KOI8-R",
|
||||
10. Unicode: "Unicode",
|
||||
11. UTF7: "UTF-7",
|
||||
12. UTF32LE: "UTF-32LE",
|
||||
13. UTF32BE: "UTF-32BE",
|
||||
|
||||
## особенности ##
|
||||
|
||||
если данные содержат только латинские символы (первая половина ASCII таблицы) будет определена кодировка UTF-8
|
||||
это не является ошибкой, поскольку такой файл или данные действительно можно использовать как UTF-8
|
||||
|
||||
при использовании golang 1.12.6 в проект добавляется код размером ~240 kB
|
||||
|
||||
## зависимости ##
|
||||
|
||||
>"golang.org/x/text/encoding/charmap"
|
||||
>"golang.org/x/text/transform"
|
||||
|
||||
## типы ##
|
||||
|
||||
IDCodePage uint16 - индекс кодовой страницы, значения взяты из файла поставки golang golang.org\x\text\encoding\internal\identifier\mib.go
|
||||
поддерживается interface String(), и можно выводить так
|
||||
cp := cpd.UTF8
|
||||
fmt.Printf("code page index, name: %d, %s\n", cp, cp)
|
||||
//>code page index, name: 106, UTF-8
|
||||
|
||||
## глобальные переменные ##
|
||||
|
||||
ReadBufSize int = 1024 // количество байт считываемых из ридера (буфера) для определения кодировки
|
||||
|
||||
## функции ##
|
||||
|
||||
1. CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error)
|
||||
2. FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error)
|
||||
|
||||
## описание ##
|
||||
|
||||
CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error)
|
||||
определяет кодовую страницу считывая поток байтов из 'r'
|
||||
используется 'reflect.ValueOf(r).IsValid()' для проверки 'r' на существование
|
||||
считывает из 'r' первые ReadBufSize байтов
|
||||
параметр stopStr пока не используется
|
||||
|
||||
FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error)
|
||||
определяет кодовую страницу считывая файл 'fn', считывает из файла первые ReadBufSize байтов
|
||||
ошибку возвращает если проблемы с открытием файла 'fn'
|
||||
возвращает cpd.ASCII если кодировка не определена
|
||||
|
||||
## tests ##
|
||||
|
||||
coverage: 84.0% of statements
|
||||
в папке "test_files" лежат файлы для тестов, соответственно не править и не удалять
|
10
README.md
10
README.md
|
@ -17,7 +17,8 @@ support russian code page:
|
|||
7. UTF-8
|
||||
8. ISO8859-5
|
||||
|
||||
### feature ###
|
||||
## feature ##
|
||||
|
||||
if a file contains only Latin symbols, it is detected as UTF-8
|
||||
this is not a mistake, this is a completely correct statement
|
||||
|
||||
|
@ -32,6 +33,10 @@ on go vertion 1.12.6 add to exe 240 kB
|
|||
|
||||
IDCodePage uint16 - index of code page, support String() interface, you can fmt.Printf("code page index, name: %d, %s\n", cp, cp) where var cp received from cpd functions
|
||||
|
||||
## variables ##
|
||||
|
||||
ReadBufSize int = 1024 // count of byte to read from input reader by default
|
||||
|
||||
## functions ##
|
||||
|
||||
1. CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error)
|
||||
|
@ -43,6 +48,9 @@ IDCodePage uint16 - index of code page, support String() interface, you can fmt.
|
|||
|
||||
CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error)
|
||||
detect code page of ascii data from reader 'r'
|
||||
uses the 'reflect' package to check the input reader
|
||||
default read only first 1024 byte from 'r' (var ReadBufSize to change this setting)
|
||||
the input parameter stopStr is not used yet
|
||||
|
||||
FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error)
|
||||
detect code page of text file "fn", read first 1024 byte (var ReadBufSize to change this setting)
|
||||
|
|
113
code_pages.go
113
code_pages.go
|
@ -17,6 +17,7 @@ func (i IDCodePage) String() string {
|
|||
}
|
||||
|
||||
//itRuneMatch - return 1 if rune from this code page, 0 else
|
||||
// function exist in every CodePage
|
||||
type itRuneMatch func(r rune, tbl *codePageTable) int
|
||||
|
||||
//runesMatch - return count of entry elements of data to code page
|
||||
|
@ -32,6 +33,7 @@ type tableElement struct {
|
|||
type codePageTable [19]tableElement
|
||||
|
||||
//MatchRes - final criterion of how well a data array matches a code page
|
||||
// may become more elaborate in the future
|
||||
type MatchRes struct {
|
||||
countMatch int // match count; TCodepagesDic.Match stores here the value returned by the code page matcher
|
||||
}
|
||||
|
@ -61,14 +63,95 @@ func (o CodePage) MatchingRunes() string {
|
|||
return sb.String()
|
||||
}
|
||||
|
||||
//TCodepagesDic - type to store all supported code page
|
||||
type TCodepagesDic map[IDCodePage]CodePage
|
||||
|
||||
//Match - return the id of code page to which the data best matches
|
||||
func (o TCodepagesDic) Match(data []byte) (result IDCodePage) {
|
||||
result = ASCII
|
||||
maxCount := 0
|
||||
for id, cp := range o {
|
||||
cp.countMatch = cp.match(data, &cp.table)
|
||||
o[id] = cp
|
||||
if cp.countMatch > maxCount {
|
||||
maxCount = cp.countMatch
|
||||
result = id
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
//CodepageDic -
|
||||
var CodepageDic = TCodepagesDic{
|
||||
ASCII: {ASCII, "ASCII", MatchRes{0}, runesMatchASCII,
|
||||
codePageTable{{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}}},
|
||||
|
||||
CP866: {CP866, "CP866", MatchRes{0}, runesMatch866,
|
||||
codePageTable{
|
||||
//first element serves as sign of absence
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0xAE, 0}, {0xA5, 0}, {0xA0, 0}, {0xA8, 0}, {0xAD, 0}, {0xE2, 0}, {0xE1, 0}, {0xE0, 0}, {0xA2, 0},
|
||||
{0x8E, 0}, {0x85, 0}, {0x80, 0}, {0x88, 0}, {0x8D, 0}, {0x92, 0}, {0x91, 0}, {0x90, 0}, {0x82, 0}}},
|
||||
UTF8: {UTF8, "UTF8", MatchRes{0}, runesMatchUTF8,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0xD0BE, 0}, {0xD0B5, 0}, {0xD0B0, 0}, {0xD0B8, 0}, {0xD0BD, 0}, {0xD182, 0}, {0xD181, 0}, {0xD180, 0}, {0xD0B2, 0},
|
||||
{0xD09E, 0}, {0xD095, 0}, {0xD090, 0}, {0xD098, 0}, {0xD0AD, 0}, {0xD0A2, 0}, {0xD0A1, 0}, {0xD0A0, 0}, {0xD092, 0}}},
|
||||
Windows1251: {Windows1251, "Windows1251", MatchRes{0}, runesMatch1251,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//а и н с р в л к в
|
||||
{0xE0, 0}, {0xE8, 0}, {0xED, 0}, {0xF1, 0}, {0xF0, 0}, {0xE2, 0}, {0xEB, 0}, {0xEA, 0}, {0xE2, 0},
|
||||
{0xC0, 0}, {0xC8, 0}, {0xCD, 0}, {0xD1, 0}, {0xD0, 0}, {0xC2, 0}, {0xCB, 0}, {0xCA, 0}, {0xC2, 0}}},
|
||||
KOI8R: {KOI8R, "KOI8R", MatchRes{0}, runesMatchKOI8,
|
||||
codePageTable{
|
||||
//о а и т с в л к м
|
||||
{0, 0},
|
||||
{0xCF, 0}, {0xC1, 0}, {0xC9, 0}, {0xD4, 0}, {0xD3, 0}, {0xD7, 0}, {0xCC, 0}, {0xCB, 0}, {0xCD, 0},
|
||||
{0xEF, 0}, {0xE1, 0}, {0xE9, 0}, {0xF4, 0}, {0xF3, 0}, {0xF7, 0}, {0xEC, 0}, {0xEB, 0}, {0xED, 0}}},
|
||||
ISOLatinCyrillic: {ISOLatinCyrillic, "ISO-8859-5", MatchRes{0}, runesMatchISO88595,
|
||||
codePageTable{
|
||||
//о а и т с в л к е
|
||||
{0, 0},
|
||||
{0xDE, 0}, {0xD0, 0}, {0xD8, 0}, {0xE2, 0}, {0xE1, 0}, {0xD2, 0}, {0xDB, 0}, {0xDA, 0}, {0xD5, 0},
|
||||
{0xBF, 0}, {0xB0, 0}, {0xB8, 0}, {0xC2, 0}, {0xC1, 0}, {0xB2, 0}, {0xBB, 0}, {0xBA, 0}, {0xB5, 0}}},
|
||||
UTF16LE: {UTF16LE, "UTF16LE", MatchRes{0}, runesMatchUTF16LE,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0x3E04, 0}, {0x3504, 0}, {0x1004, 0}, {0x3804, 0}, {0x3D04, 0}, {0x4204, 0}, {0x4104, 0}, {0x4004, 0}, {0x3204, 0},
|
||||
{0x1E04, 0}, {0x1504, 0}, {0x3004, 0}, {0x1804, 0}, {0x1D04, 0}, {0x2204, 0}, {0x2104, 0}, {0x2004, 0}, {0x1204, 0}}},
|
||||
UTF16BE: {UTF16BE, "UTF16BE", MatchRes{0}, runesMatchUTF16BE,
|
||||
codePageTable{
|
||||
{0, 0},
|
||||
//о е а и н т с р в
|
||||
{0x043E, 0}, {0x0435, 0}, {0x0410, 0}, {0x0438, 0}, {0x043D, 0}, {0x0442, 0}, {0x0441, 0}, {0x0440, 0}, {0x0432, 0},
|
||||
{0x041E, 0}, {0x0415, 0}, {0x0430, 0}, {0x0418, 0}, {0x041D, 0}, {0x0422, 0}, {0x0421, 0}, {0x0420, 0}, {0x0412, 0}}},
|
||||
}
|
||||
|
||||
//codePageName - string of code page name
|
||||
var codePageName = map[IDCodePage]string{
|
||||
ASCII: "ASCII",
|
||||
ISOLatinCyrillic: "ISO-8859-5",
|
||||
CP866: "CP866",
|
||||
Windows1251: "Windows-1251",
|
||||
UTF8: "UTF-8",
|
||||
UTF16LE: "UTF-16LE",
|
||||
UTF16BE: "UTF-16BE",
|
||||
UTF32: "UTF-32",
|
||||
KOI8R: "KOI8-R",
|
||||
Unicode: "Unicode",
|
||||
UTF7: "UTF-7",
|
||||
UTF32LE: "UTF-32LE",
|
||||
UTF32BE: "UTF-32BE",
|
||||
}
|
||||
|
||||
/*
|
||||
//TCodePages - type for store all code page
|
||||
type TCodePages []CodePage
|
||||
|
||||
//DeepMach -
|
||||
func (o *TCodePages) DeepMach(data []byte) IDCodePage {
|
||||
return ASCII
|
||||
}
|
||||
|
||||
//Match - return IDCodePage
|
||||
//simple calculate count entry data runes in standart code page table
|
||||
func (o TCodePages) Match(data []byte) (result IDCodePage) {
|
||||
|
@ -84,6 +167,11 @@ func (o TCodePages) Match(data []byte) (result IDCodePage) {
|
|||
return result
|
||||
}
|
||||
|
||||
//DeepMach -
|
||||
func (o *TCodePages) DeepMach(data []byte) IDCodePage {
|
||||
return ASCII
|
||||
}
|
||||
|
||||
//CodePages - slice of code pages
|
||||
var CodePages = TCodePages{
|
||||
{ASCII, "ASCII", MatchRes{0}, runesMatchASCII,
|
||||
|
@ -132,17 +220,4 @@ var CodePages = TCodePages{
|
|||
{0x043E, 0}, {0x0435, 0}, {0x0410, 0}, {0x0438, 0}, {0x043D, 0}, {0x0442, 0}, {0x0441, 0}, {0x0440, 0}, {0x0432, 0},
|
||||
{0x041E, 0}, {0x0415, 0}, {0x0430, 0}, {0x0418, 0}, {0x041D, 0}, {0x0422, 0}, {0x0421, 0}, {0x0420, 0}, {0x0412, 0}}},
|
||||
}
|
||||
|
||||
//codePageName - string of code page name
|
||||
var codePageName = map[IDCodePage]string{
|
||||
ASCII: "ASCII",
|
||||
ISOLatinCyrillic: "ISO-8859-5",
|
||||
IBM866: "IBM866",
|
||||
Windows1251: "Windows1251",
|
||||
UTF8: "UTF8",
|
||||
UTF16LE: "UTF16LE",
|
||||
UTF16BE: "UTF16BE",
|
||||
UTF32: "UTF32",
|
||||
KOI8R: "KOI8R",
|
||||
Unicode: "Unicode",
|
||||
}
|
||||
*/
|
||||
|
|
|
@ -72,10 +72,10 @@ const (
|
|||
// Reference: RFC1489
|
||||
KOI8R IDCodePage = 2084
|
||||
|
||||
// IBM866 is the uint16 identifier with IANA name IBM866.
|
||||
// CP866 is the uint16 identifier with IANA name IBM866.
|
||||
//
|
||||
// IBM NLDG Volume 2 (SE09-8002-03) August 1994
|
||||
IBM866 IDCodePage = 2086
|
||||
CP866 IDCodePage = 2086
|
||||
|
||||
// Windows1251 is the uint16 identifier with IANA name windows-1251.
|
||||
//
|
||||
|
|
|
@ -1,45 +0,0 @@
|
|||
package cpd
|
||||
|
||||
//checkHeader - check buffer for match to utf-8, utf-16le or utf-16be BOM
|
||||
func checkHeader(b []byte) (id IDCodePage, res bool) {
|
||||
if bomUTF8(b) {
|
||||
return UTF8, true
|
||||
}
|
||||
if bomUTF16le(b) {
|
||||
return UTF16LE, true
|
||||
}
|
||||
if bomUTF16be(b) {
|
||||
return UTF16BE, true
|
||||
}
|
||||
return ASCII, false
|
||||
}
|
||||
|
||||
func bomUTF8(b []byte) bool {
|
||||
if len(b) < 3 {
|
||||
return false
|
||||
}
|
||||
return (b[0] == 0xEF) && (b[1] == 0xBB) && (b[2] == 0xBF)
|
||||
}
|
||||
|
||||
func bomUTF16le(b []byte) bool {
|
||||
if len(b) < 2 {
|
||||
return false
|
||||
}
|
||||
return (b[0] == 0xFF) && (b[1] == 0xFE)
|
||||
}
|
||||
|
||||
func bomUTF16be(b []byte) bool {
|
||||
if len(b) < 2 {
|
||||
return false
|
||||
}
|
||||
return (b[0] == 0xFE) && (b[1] == 0xFF)
|
||||
}
|
||||
|
||||
//ASCII block
|
||||
func itASCII(r rune, tbl *codePageTable) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runesMatchASCII(b []byte, tbl *codePageTable) int {
|
||||
return 0
|
||||
}
|
59
cpd.go
59
cpd.go
|
@ -1,7 +1,5 @@
|
|||
//Package cpd - code page detect
|
||||
// (c) 2019 softlandia@gmail.com
|
||||
// v0.1.0
|
||||
// 01/oct/2019
|
||||
package cpd
|
||||
|
||||
import (
|
||||
|
@ -18,9 +16,19 @@ import (
|
|||
//ReadBufSize - byte count for reading from file, func FileCodePageDetect()
|
||||
var ReadBufSize int = 1024
|
||||
|
||||
//CodePageAutoDetect - auto detect code page of input content
|
||||
func CodePageAutoDetect(content []byte) (result IDCodePage) {
|
||||
return CodePages.Match(content)
|
||||
//FileCodePageDetect - detect code page of text file
|
||||
func FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error) {
|
||||
|
||||
iFile, err := os.Open(fn)
|
||||
if err != nil {
|
||||
return ASCII, err
|
||||
}
|
||||
defer iFile.Close()
|
||||
|
||||
if len(stopStr) > 0 {
|
||||
return CodePageDetect(iFile, stopStr[0])
|
||||
}
|
||||
return CodePageDetect(iFile)
|
||||
}
|
||||
|
||||
//CodePageDetect - detect code page of ascii data from reader 'r'
|
||||
|
@ -37,45 +45,35 @@ func CodePageDetect(r io.Reader, stopStr ...string) (IDCodePage, error) {
|
|||
return ASCII, err
|
||||
}
|
||||
|
||||
//check file header // utf-8, utf-16 with BOM
|
||||
if idCodePage, ok := checkHeader(buf); ok {
|
||||
//does buf start with the BOM of utf-8, utf-16 or utf-32
|
||||
if idCodePage, ok := CheckBOM(buf); ok {
|
||||
return idCodePage, nil
|
||||
}
|
||||
|
||||
//check data for UTF
|
||||
if IsUtf8(buf) {
|
||||
if ValidUTF8(buf) {
|
||||
return UTF8, nil
|
||||
}
|
||||
|
||||
return CodePageAutoDetect(buf), nil
|
||||
}
|
||||
|
||||
//FileCodePageDetect - detect code page of text file
|
||||
func FileCodePageDetect(fn string, stopStr ...string) (IDCodePage, error) {
|
||||
|
||||
iFile, err := os.Open(fn)
|
||||
if err != nil {
|
||||
return ASCII, err
|
||||
}
|
||||
defer iFile.Close()
|
||||
|
||||
if len(stopStr) > 0 {
|
||||
return CodePageDetect(iFile, stopStr[0])
|
||||
}
|
||||
return CodePageDetect(iFile)
|
||||
//CodePageAutoDetect - auto detect code page of input content
|
||||
func CodePageAutoDetect(content []byte) (result IDCodePage) {
|
||||
return CodepageDic.Match(content) //TODO большинству матчеров требуется более 2х символов, надо проверить на минимальную длину
|
||||
}
|
||||
|
||||
//FileConvertCodePage - replace code page text file from one to another
|
||||
// support convert only from/to Windows1251/IBM866
|
||||
func FileConvertCodePage(fileName string, fromCP, toCP IDCodePage) error {
|
||||
if fromCP == toCP {
|
||||
return nil
|
||||
}
|
||||
|
||||
if (fromCP != Windows1251) && (fromCP != IBM866) {
|
||||
if (fromCP != Windows1251) && (fromCP != CP866) {
|
||||
return nil
|
||||
}
|
||||
|
||||
if (toCP != Windows1251) && (toCP != IBM866) {
|
||||
if (toCP != Windows1251) && (toCP != CP866) {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -101,7 +99,7 @@ func FileConvertCodePage(fileName string, fromCP, toCP IDCodePage) error {
|
|||
if err != nil {
|
||||
oFile.Close()
|
||||
os.Remove(tmpFileName)
|
||||
return fmt.Errorf("cde page convert error on file '%s': %v", fileName, err)
|
||||
return fmt.Errorf("code page convert error on file '%s': %v", fileName, err)
|
||||
}
|
||||
fmt.Fprintf(oFile, "%s\n", s)
|
||||
}
|
||||
|
@ -110,7 +108,14 @@ func FileConvertCodePage(fileName string, fromCP, toCP IDCodePage) error {
|
|||
return os.Rename(tmpFileName, fileName)
|
||||
}
|
||||
|
||||
//ToUTF8 - presumably meant to convert s to UTF-8 (not implemented yet)
|
||||
//TODO need realization; for now s is returned unchanged
|
||||
func ToUTF8(s string) string {
|
||||
return s
|
||||
}
|
||||
|
||||
//StrConvertCodePage - convert string from one code page to another
|
||||
// function for future, at now support convert only from/to Windows1251/IBM866
|
||||
func StrConvertCodePage(s string, fromCP, toCP IDCodePage) (string, error) {
|
||||
if len(s) == 0 {
|
||||
return "", nil
|
||||
|
@ -122,13 +127,13 @@ func StrConvertCodePage(s string, fromCP, toCP IDCodePage) (string, error) {
|
|||
var err error
|
||||
|
||||
switch fromCP {
|
||||
case IBM866:
|
||||
case CP866:
|
||||
s, _, err = transform.String(charmap.CodePage866.NewDecoder(), s)
|
||||
case Windows1251:
|
||||
s, _, err = transform.String(charmap.Windows1251.NewDecoder(), s)
|
||||
}
|
||||
switch toCP {
|
||||
case IBM866:
|
||||
case CP866:
|
||||
s, _, err = transform.String(charmap.CodePage866.NewEncoder(), s)
|
||||
case Windows1251:
|
||||
s, _, err = transform.String(charmap.Windows1251.NewEncoder(), s)
|
||||
|
|
141
cpd_test.go
141
cpd_test.go
|
@ -13,8 +13,8 @@ type tCodePageAsString struct {
|
|||
var dCodePageAsString = []tCodePageAsString{
|
||||
{0, ""},
|
||||
{3, "ASCII"},
|
||||
{IBM866, "IBM866"},
|
||||
{Windows1251, "Windows1251"},
|
||||
{CP866, "CP866"},
|
||||
{Windows1251, "Windows-1251"},
|
||||
{60000, ""},
|
||||
}
|
||||
|
||||
|
@ -27,6 +27,61 @@ func TestCodePageAsString(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
type tFileCodePageDetectTest struct {
|
||||
fn string //filename
|
||||
st string //stop string
|
||||
e error //
|
||||
r IDCodePage //expected result
|
||||
}
|
||||
|
||||
var dFileCodePageDetect = []tFileCodePageDetectTest{
|
||||
{"test_files\\utf32le-wBOM.txt", "", nil, UTF32LE}, //file contain utf32 little endian with bom
|
||||
{"test_files\\KOI8-r.txt", "", nil, KOI8R}, //file contain KOI8
|
||||
{"test_files\\IBM866.txt", "", nil, CP866}, //file contain IBM866
|
||||
{"test_files\\Win1251.txt", "", nil, Windows1251}, //file contain Windows1251
|
||||
{"test_files\\utf8-woBOM.txt", "", nil, UTF8}, //file contain utf8 with out bom rune at start
|
||||
{"test_files\\866&1251.txt", "", nil, Windows1251}, //file contain more 1251 then 866
|
||||
{"test_files\\noCodePage.txt", "", nil, UTF8}, //file contain rune only ASCII
|
||||
{"test_files\\empty_file.txt", "", nil, UTF8}, //file exist but empty, no error, return ASCII
|
||||
{"test_files\\rune_encode_error.txt", "", nil, ASCII}, //file contain special rune -> encode error, but detect NO error
|
||||
{"test_files\\rune_error_1251.txt", "", nil, Windows1251}, //file contain 1251 and special rune -> encode error, but detect NO error
|
||||
{"test_files\\utf8wbom.txt", "", nil, UTF8}, //file contain utf8 with bom prefix
|
||||
{"test_files\\utf16LEwbom.txt", "", nil, UTF16LE}, //file contain utf16 little endian with BOM
|
||||
{"test_files\\utf16BEwbom.txt", "", nil, UTF16BE}, //file contain utf16 big endian with BOM
|
||||
{"test_files\\utf16le-wBOM.txt", "", nil, UTF16LE}, //file contain utf16 little endian with bom
|
||||
{"test_files\\utf16le-woBOM.txt", "", nil, UTF16LE}, //file contain utf16 little endian without bom
|
||||
{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE}, //file contain utf16 big endian without bom
|
||||
{"test_files\\utf32be-wBOM.txt", "", nil, UTF32BE}, //file contain utf32 big endian with bom
|
||||
}
|
||||
|
||||
//FileCodePageDetect
|
||||
func TestFileCodePageDetect(t *testing.T) {
|
||||
var (
|
||||
err error
|
||||
res IDCodePage
|
||||
)
|
||||
for _, d := range dFileCodePageDetect {
|
||||
res, err = FileCodePageDetect(d.fn)
|
||||
if err != d.e {
|
||||
t.Errorf("<FileCodePageDetect> on file '%s' expected error: '%v', got: '%v', ", d.fn, d.e, err)
|
||||
}
|
||||
if res != d.r {
|
||||
t.Errorf("<FileCodePageDetect> on file '%s' expected result: %s, got: %s", d.fn, d.r, res)
|
||||
}
|
||||
}
|
||||
|
||||
_, err = FileCodePageDetect("-.-") //file "-.-" not exist
|
||||
if err == nil {
|
||||
t.Errorf("<FileCodePageDetect> on file '-.-' must return error, but return nil")
|
||||
}
|
||||
|
||||
_, err = FileCodePageDetect("") //file "" not exist
|
||||
if err == nil {
|
||||
t.Errorf("<FileCodePageDetect> on file '' must return error, but return nil")
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//TestCodePageDetect - тестирование метода CodePageDetect
|
||||
// проверки на входные параметры:
|
||||
// 1. nil входящий поток явный nil, параметр останова отсутствует
|
||||
|
@ -55,8 +110,8 @@ func TestFileCodePageDetectSimple(t *testing.T) {
|
|||
if err != nil {
|
||||
t.Errorf("<FileCodePageDetect()> on file '866to1251.txt' err expected: nil, got: %s\n", err)
|
||||
}
|
||||
if res != IBM866 {
|
||||
t.Errorf("<FileCodePageDetect()> on file '866to1251.txt' expected: %s, got: %s\n", IBM866, res)
|
||||
if res != CP866 {
|
||||
t.Errorf("<FileCodePageDetect()> on file '866to1251.txt' expected: %s, got: %s\n", CP866, res)
|
||||
}
|
||||
res, err = FileCodePageDetect("test_files\\866&1251.txt")
|
||||
if err != nil {
|
||||
|
@ -77,93 +132,35 @@ func TestFileCodePageDetectUtf8Bom(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
type tFileCodePageDetectTest struct {
|
||||
fn string //filename
|
||||
st string //stop string
|
||||
e error //
|
||||
r IDCodePage //expected result
|
||||
}
|
||||
|
||||
var dFileCodePageDetect = []tFileCodePageDetectTest{
|
||||
{"test_files\\KOI8-r.txt", "", nil, KOI8R}, //file contain KOI8
|
||||
{"test_files\\IBM866.txt", "", nil, IBM866}, //file contain IBM866
|
||||
{"test_files\\Win1251.txt", "", nil, Windows1251}, //file contain Windows1251
|
||||
{"test_files\\utf16BEwbom.txt", "", nil, UTF16BE}, //file contain utf16 big endian with bom rune at start
|
||||
{"test_files\\utf16le-wBOM.txt", "", nil, UTF16LE}, //file contain utf16 liitle endian with bom rune at start
|
||||
{"test_files\\utf8-woBOM.txt", "", nil, UTF8}, //file contain utf8 with out bom rune at start
|
||||
{"test_files\\866&1251.txt", "~X~", nil, Windows1251}, //befor ~X~ file contain 866, after 1251
|
||||
{"test_files\\866&1251.txt", "", nil, Windows1251}, //file contain more 1251 then 866
|
||||
{"test_files\\noCodePage.txt", "", nil, UTF8}, //file contain rune only ASCII
|
||||
{"test_files\\empty_file.txt", "", nil, UTF8}, //file exist but empty, no error, return ASCII
|
||||
{"test_files\\rune_encode_error.txt", "", nil, ASCII}, //file contain special rune -> encode error, but detect NO error
|
||||
{"test_files\\rune_error_1251.txt", "", nil, Windows1251}, //file contain 1251 and special rune -> encode error, but detect NO error
|
||||
{"test_files\\utf8wbom.txt", "", nil, UTF8}, //file contain utf8 with bom rune at start
|
||||
{"test_files\\utf16LEwbom.txt", "", nil, UTF16LE}, //file contain utf16 little endian with bom rune at start
|
||||
{"test_files\\utf16le-woBOM.txt", "", nil, UTF16LE}, //file contain utf16 liitle endian with out bom rune at start
|
||||
{"test_files\\utf16be-woBOM.txt", "", nil, UTF16BE}, //file contain utf16 big endian with out bom rune at start
|
||||
}
|
||||
|
||||
//FileCodePageDetect
|
||||
func TestFileCodePageDetect(t *testing.T) {
|
||||
var (
|
||||
err error
|
||||
res IDCodePage
|
||||
)
|
||||
for _, d := range dFileCodePageDetect {
|
||||
if len(d.st) == 0 {
|
||||
res, err = FileCodePageDetect(d.fn)
|
||||
} else {
|
||||
res, err = FileCodePageDetect(d.fn, d.st)
|
||||
}
|
||||
if err != d.e {
|
||||
t.Errorf("<FileCodePageDetect> on file '%s' expected error: '%v', got: '%v', ", d.fn, d.e, err)
|
||||
}
|
||||
if res != d.r {
|
||||
t.Errorf("<FileCodePageDetect> on file '%s' expected result: %s, got: %s", d.fn, d.r, res)
|
||||
}
|
||||
}
|
||||
|
||||
_, err = FileCodePageDetect("-.-") //file "-.-" not exist
|
||||
if err == nil {
|
||||
t.Errorf("<FileCodePageDetect> on file '-.-' must return error, but return nil")
|
||||
}
|
||||
|
||||
_, err = FileCodePageDetect("") //file "" not exist
|
||||
if err == nil {
|
||||
t.Errorf("<FileCodePageDetect> on file '' must return error, but return nil")
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//FileConvertCodePage
|
||||
func TestFileConvertCodePage(t *testing.T) {
|
||||
err := FileConvertCodePage("", IBM866, Windows1251)
|
||||
err := FileConvertCodePage("", CP866, Windows1251)
|
||||
if err == nil {
|
||||
t.Errorf("<FileConvertCodePage> on empty file name expected error, got: %v", err)
|
||||
}
|
||||
|
||||
err = FileConvertCodePage("", IBM866, IBM866)
|
||||
err = FileConvertCodePage("", CP866, CP866)
|
||||
if err != nil {
|
||||
t.Errorf("<FileConvertCodePage> on fromCp == toCp expected error==nil, got: %v", err)
|
||||
}
|
||||
|
||||
err = FileConvertCodePage("123", UTF8, IBM866)
|
||||
err = FileConvertCodePage("123", UTF8, CP866)
|
||||
if err != nil {
|
||||
t.Errorf("<FileConvertCodePage> on fromCp or toCp not Windows1251 or IBM866 expected error == nil, got: %v", err)
|
||||
}
|
||||
|
||||
err = FileConvertCodePage("123", IBM866, UTF16LE)
|
||||
err = FileConvertCodePage("123", CP866, UTF16LE)
|
||||
if err != nil {
|
||||
t.Errorf("<FileConvertCodePage> on fromCp or toCp not Windows1251 or IBM866 expected error == nil, got: %v", err)
|
||||
}
|
||||
|
||||
err = FileConvertCodePage("test_files\\rune_encode_error.txt", IBM866, Windows1251)
|
||||
err = FileConvertCodePage("test_files\\rune_encode_error.txt", CP866, Windows1251)
|
||||
if err == nil {
|
||||
t.Errorf("<FileConvertCodePage> expected error, got: %v", err)
|
||||
}
|
||||
|
||||
os.Link("test_files\\866to1251.txt", "test_files\\866to1251.tmp")
|
||||
err = FileConvertCodePage("test_files\\866to1251.tmp", IBM866, Windows1251)
|
||||
err = FileConvertCodePage("test_files\\866to1251.tmp", CP866, Windows1251)
|
||||
if err != nil {
|
||||
t.Errorf("<FileConvertCodePage> expect no err, got: %v", err)
|
||||
}
|
||||
|
@ -172,19 +169,19 @@ func TestFileConvertCodePage(t *testing.T) {
|
|||
|
||||
//ConvertCodePage
|
||||
func TestStrConvertCodePage(t *testing.T) {
|
||||
_, err := StrConvertCodePage("1234", IBM866, Windows1251)
|
||||
_, err := StrConvertCodePage("1234", CP866, Windows1251)
|
||||
if err != nil {
|
||||
t.Errorf("<StrConvertCodePage> on test 1 return unexpected err: %v", err)
|
||||
}
|
||||
_, err = StrConvertCodePage("1234", Windows1251, IBM866)
|
||||
_, err = StrConvertCodePage("1234", Windows1251, CP866)
|
||||
if err != nil {
|
||||
t.Errorf("<StrConvertCodePage> on test 2 return unexpected err: %v", err)
|
||||
}
|
||||
_, err = StrConvertCodePage("", IBM866, Windows1251)
|
||||
_, err = StrConvertCodePage("", CP866, Windows1251)
|
||||
if err != nil {
|
||||
t.Errorf("<StrConvertCodePage> with empty string must return ERROR, but retrurn: %v", err)
|
||||
}
|
||||
_, err = StrConvertCodePage("1234", IBM866, IBM866)
|
||||
_, err = StrConvertCodePage("1234", CP866, CP866)
|
||||
if err != nil {
|
||||
t.Errorf("<StrConvertCodePage> with equal fromCP and toCp must return nil, but retrurn: %v", err)
|
||||
}
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
з «® ў Є®¤Ёа®ўЄҐ 866 Ё Ґс Ўг¤Ґв ¬®Ј®
|
||||
~X~
|
||||
пример 1251 ТОЧНО ЖУЙ эти булочки
|
||||
~A
|
||||
<OK>
|
||||
ё Ё
|
||||
<OK>
|
||||
Б ФЕРЕТШ KOI8 ОЕНОПЗП
|
||||
АгббЪШЩ Т ЪЮФШаЮТЪХ ISO8859-5
|
|
@ -0,0 +1,16 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/softlandia/cpd"
|
||||
)
|
||||
|
||||
func main() {
|
||||
t, _ := cpd.FileCodePageDetect(os.Args[1])
|
||||
fmt.Printf("cpd.FileCodePageDetect():\t%s\n", t)
|
||||
for id, cp := range cpd.CodepageDic {
|
||||
fmt.Printf("%s, %s\n", id, cp.MatchingRunes())
|
||||
}
|
||||
}
|
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
4
utf8.go
4
utf8.go
|
@ -51,8 +51,8 @@ func testUTF8bitPattern(b byte) (int, cp int32) {
|
|||
return 0, 0
|
||||
}
|
||||
|
||||
//IsUtf8 - return true if imput slice contain true UTF-8
|
||||
func IsUtf8(data []byte) bool {
|
||||
//ValidUTF8 - return true if input slice contains valid UTF-8
|
||||
func ValidUTF8(data []byte) bool {
|
||||
m := len(data)
|
||||
if m <= 1 {
|
||||
return true
|
||||
|
|
|
@ -4,7 +4,6 @@ import "unicode"
|
|||
|
||||
//unit for windows1251
|
||||
|
||||
//TODO: нужно отличить от KOI-8r
|
||||
func runesMatch1251(data []byte, tbl *codePageTable) (counts int) {
|
||||
for i := range data {
|
||||
if i < 2 {
|
||||
|
|
Загрузка…
Ссылка в новой задаче