go.text/language: bunch of bug fixes in extension handling:

[this time with the correct files:]
Measures to expose bugs:
  - Added more elaborate tests for extensions.
  - Return end instead of len(scan.b) at the end of parseExtension
    to force exposing bugs.
  - parseExtensions used to sometimes update scan.b and sometimes not,
    leaving it to parse. Made this more consistent, simplifying parse
        and forcing errors to be exposed.
  - Removed some checks to catch errors that should have been caught
    elsewhere. Again to expose bugs.
  - Tightened some of the checks to expose bugs more easily.

Bugs fixed:
  - Attributes in the -u extension are now sorted, as per the LDML spec
    (even though nobody uses them, a spec is a spec).
  - Fixed various bugs where invalid keys or values were not properly
    removed. Merged the special case and common case to eliminate rare
    code paths and simplify testing.
  - Fixed some bugs where invalid empty extensions were not properly
    removed.
  - Fixed bug in Compose, which dropped the 'w', '9', and 'z' extensions.

Other:
  - removed parsePrivate to simplify code.

R=r
CC=golang-dev
https://golang.org/cl/14669043
This commit is contained in:
Marcel van Lohuizen 2013-10-16 11:12:07 +02:00
Родитель 4fe0ccd82b
Коммит 51fb595f78
2 изменённых файлов: 106 добавлений и 54 удалений

Просмотреть файл

@ -154,14 +154,14 @@ func (s *scanner) gobble(e error) {
s.next = s.start
}
// deleteRange removes the given range and sets the scanning position to the first
// token after the deleted range.
// deleteRange removes the given range from s.b before the current token.
func (s *scanner) deleteRange(start, end int) {
s.setError(errSyntax)
s.b = s.b[:start-1+copy(s.b[start-1:], s.b[end:])]
s.end = start - 1
s.start, s.next = start, start
s.scan()
s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
diff := end - start
s.next -= diff
s.start -= diff
s.end -= diff
}
// scan parses the next token of a BCP 47 string. Tokens that are larger
@ -242,7 +242,10 @@ func parse(scan *scanner, s string) (t Tag, err error) {
var end int
if n := len(scan.token); n <= 1 {
scan.toLower(0, len(scan.b))
end = parsePrivate(scan)
if n == 0 || scan.token[0] != 'x' {
return t, errSyntax
}
end = parseExtensions(scan)
} else if n >= 4 {
return und, errSyntax
} else { // the usual case
@ -250,14 +253,13 @@ func parse(scan *scanner, s string) (t Tag, err error) {
if n := len(scan.token); n == 1 {
t.pExt = uint16(end)
end = parseExtensions(scan)
} else if end < len(scan.b) {
scan.setError(errSyntax)
scan.b = scan.b[:end]
}
}
if end < len(scan.b) {
scan.setError(errSyntax)
scan.b = scan.b[:end]
}
if len(scan.b) < len(s) {
s = s[:len(scan.b)]
if end < len(s) {
s = s[:end]
}
if len(s) > 0 && cmp(s, scan.b) == 0 {
t.str = &s
@ -367,10 +369,13 @@ func parseVariants(scan *scanner, end int, t Tag) int {
k++
l = w
}
str := bytes.Join(variant[:k], separator)
scan.resizeRange(start, end, len(str))
copy(scan.b[scan.start:], str)
end = scan.end
if str := bytes.Join(variant[:k], separator); len(str) == 0 {
end = start - 1
} else {
scan.resizeRange(start, end, len(str))
copy(scan.b[scan.start:], str)
end = scan.end
}
}
return end
}
@ -454,40 +459,64 @@ func (b bytesSort) Less(i, j int) bool {
// parseExtensions parses and normalizes the extensions in the buffer.
// It returns the last position of scan.b that is part of any extension.
// TODO: return errors.
// It also trims scan.b to remove excess parts accordingly.
func parseExtensions(scan *scanner) int {
start := scan.start
exts := [][]byte{}
private := []byte{}
end := scan.end
for len(scan.token) == 1 {
start := scan.start
extension := []byte{}
extStart := scan.start
ext := scan.token[0]
switch ext {
case 'u':
attrEnd := scan.acceptMinSize(3)
end = attrEnd
var key []byte
for last := []byte{}; len(scan.token) == 2; last = key {
key = scan.token
keyStart, keyEnd := scan.start, scan.end
end = scan.acceptMinSize(3)
if keyEnd == end {
scan.deleteRange(keyStart, end)
continue
attrStart := end
scan.scan()
for last := []byte{}; len(scan.token) > 2; scan.scan() {
if bytes.Compare(scan.token, last) != -1 {
// Attributes are unsorted. Start over from scratch.
p := attrStart + 1
scan.next = p
attrs := [][]byte{}
for scan.scan(); len(scan.token) > 2; scan.scan() {
attrs = append(attrs, scan.token)
end = scan.end
}
sort.Sort(bytesSort(attrs))
copy(scan.b[p:], bytes.Join(attrs, separator))
break
}
last = scan.token
end = scan.end
}
var last, key []byte
for attrEnd := end; len(scan.token) == 2; last = key {
key = scan.token
keyEnd := scan.end
end = scan.acceptMinSize(3)
// TODO: check key value validity
if bytes.Compare(key, last) != 1 {
if keyEnd == end || bytes.Compare(key, last) != 1 {
// We have an invalid key or the keys are not sorted.
// Start scanning keys from scratch and reorder.
p := attrEnd + 1
scan.next = p
keys := [][]byte{}
for scan.scan(); len(scan.token) == 2; {
keyStart := scan.start
keyStart, keyEnd := scan.start, scan.end
end = scan.acceptMinSize(3)
keys = append(keys, scan.b[keyStart:end])
if keyEnd != end {
keys = append(keys, scan.b[keyStart:end])
} else {
scan.setError(errSyntax)
end = keyStart
}
}
sort.Sort(bytesSort(keys))
reordered := bytes.Join(keys, separator)
if e := p + len(reordered); e < end {
scan.deleteRange(e, end)
end = e
}
copy(scan.b[p:], bytes.Join(keys, separator))
break
}
@ -496,7 +525,7 @@ func parseExtensions(scan *scanner) int {
scan.scan()
if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
_, end = parseTag(scan)
scan.toLower(start, end)
scan.toLower(extStart, end)
}
for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
end = scan.acceptMinSize(3)
@ -506,11 +535,13 @@ func parseExtensions(scan *scanner) int {
default:
end = scan.acceptMinSize(2)
}
extension = scan.b[start:end]
if len(extension) < 3 {
extension := scan.b[extStart:end]
if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
scan.setError(errSyntax)
end = extStart
continue
} else if len(exts) == 0 && (ext == 'x' || scan.next >= len(scan.b)) {
} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
scan.b = scan.b[:end]
return end
} else if ext == 'x' {
private = extension
@ -518,23 +549,18 @@ func parseExtensions(scan *scanner) int {
}
exts = append(exts, extension)
}
if scan.next < len(scan.b) {
scan.setError(errSyntax)
}
sort.Sort(bytesSort(exts))
if len(private) > 0 {
exts = append(exts, private)
}
scan.b = append(scan.b[:start], bytes.Join(exts, separator)...)
return len(scan.b)
}
func parsePrivate(scan *scanner) int {
if len(scan.token) == 0 || scan.token[0] != 'x' {
scan.setError(errSyntax)
return scan.start
scan.b = scan.b[:start]
if len(exts) > 0 {
scan.b = append(scan.b, bytes.Join(exts, separator)...)
} else if start > 0 {
// Strip trailing '-'.
scan.b = scan.b[:start-1]
}
return parseExtensions(scan)
return end
}
// A Part identifies a part of the language tag.
@ -589,13 +615,13 @@ func Compose(m map[Part]string) (t Tag, err error) {
}
add(p)
}
for p := Part('0'); p < Part('9'); p++ {
for p := Part('0'); p <= Part('9'); p++ {
add(p)
}
for p := Part('a'); p < Part('w'); p++ {
for p := Part('a'); p <= Part('w'); p++ {
add(p)
}
for p := Part('y'); p < Part('z'); p++ {
for p := Part('y'); p <= Part('z'); p++ {
add(p)
}
add(Part('x'))

Просмотреть файл

@ -123,6 +123,7 @@ func parseTests() []parseTest {
{in: "de-Latn-1901", lang: "de", script: "Latn", variants: "1901"},
// Do not accept with incorrect script
{in: "de-Cyrl-1901", lang: "de", script: "Cyrl", variants: "", invalid: true},
{in: "de-Cyrl-1902", lang: "de", script: "Cyrl", variants: "", invalid: true},
// Specialized.
{in: "sl-rozaj", lang: "sl", variants: "rozaj"},
{in: "sl-rozaj-lipaw", lang: "sl", variants: "rozaj-lipaw"},
@ -148,18 +149,40 @@ func parseTests() []parseTest {
{in: "en-c_cc-b-bbb-a-aaa", lang: "en", changed: true, extList: []string{"a-aaa", "b-bbb", "c-cc"}},
{in: "en-x_cc-b-bbb-a-aaa", lang: "en", ext: "x-cc-b-bbb-a-aaa", changed: true},
{in: "en-c_cc-b-bbb-a-aaa-x-x", lang: "en", changed: true, extList: []string{"a-aaa", "b-bbb", "c-cc", "x-x"}},
{in: "en-v-c", lang: "en", ext: "", invalid: true},
{in: "en-v-abcdefghi", lang: "en", ext: "", invalid: true},
{in: "en-v-abc-x", lang: "en", ext: "v-abc", invalid: true},
{in: "en-v-abc-x-", lang: "en", ext: "v-abc", invalid: true},
{in: "en-v-abc-w-x-xx", lang: "en", extList: []string{"v-abc", "x-xx"}, invalid: true, changed: true},
{in: "en-v-abc-w-y-yx", lang: "en", extList: []string{"v-abc", "y-yx"}, invalid: true, changed: true},
{in: "en-v-c-abc", lang: "en", ext: "c-abc", invalid: true, changed: true},
{in: "en-v-w-abc", lang: "en", ext: "w-abc", invalid: true, changed: true},
{in: "en-v-x-abc", lang: "en", ext: "x-abc", invalid: true, changed: true},
{in: "en-v-x-a", lang: "en", ext: "x-a", invalid: true, changed: true},
{in: "en-9-aa-0-aa-z-bb-x-a", lang: "en", extList: []string{"0-aa", "9-aa", "z-bb", "x-a"}, changed: true},
{in: "en-u-c", lang: "en", ext: "", invalid: true},
{in: "en-u-co-phonebk", lang: "en", ext: "u-co-phonebk"},
{in: "en-u-co-phonebk-ca", lang: "en", ext: "u-co-phonebk", invalid: true},
{in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-co-phonebook", lang: "en", ext: "", invalid: true},
{in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-cu-xau", invalid: true, changed: true},
{in: "en-Cyrl-u-co-phonebk", lang: "en", script: "Cyrl", ext: "u-co-phonebk"},
{in: "en-US-u-co-phonebk", lang: "en", region: "US", ext: "u-co-phonebk"},
{in: "en-US-u-co-phonebk-cu-xau", lang: "en", region: "US", ext: "u-co-phonebk-cu-xau"},
{in: "en-scotland-u-co-phonebk", lang: "en", variants: "scotland", ext: "u-co-phonebk"},
{in: "en-u-cu-xua-co-phonebk", lang: "en", ext: "u-co-phonebk-cu-xua", changed: true},
{in: "en-u-def-abc-cu-xua-co-phonebk", lang: "en", ext: "u-def-abc-co-phonebk-cu-xua", changed: true},
{in: "en-u-def-abc", lang: "en", ext: "u-def-abc"},
{in: "en-u-def-abc-cu-xua-co-phonebk", lang: "en", ext: "u-abc-def-co-phonebk-cu-xua", changed: true},
{in: "en-u-def-abc", lang: "en", ext: "u-abc-def", changed: true},
{in: "en-u-cu-xua-co-phonebk-a-cd", lang: "en", extList: []string{"a-cd", "u-co-phonebk-cu-xua"}, changed: true},
// Invalid "u" extension. Drop invalid parts.
{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk"}, invalid: true, changed: true},
{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-cu-xau"}, invalid: true},
// We allow duplicate keys as the LDML spec does not explicitly prohibit it.
// TODO: Consider eliminating duplicates and returning an error.
{in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau-cu-xau", changed: true},
{in: "en-t-en-Cyrl-NL-fonipa", lang: "en", ext: "t-en-cyrl-nl-fonipa", changed: true},
{in: "en-t-en-Cyrl-NL-fonipa-t0-abc-def", lang: "en", ext: "t-en-cyrl-nl-fonipa-t0-abc-def", changed: true},
{in: "en-t-t0-abcd", lang: "en", ext: "t-t0-abcd"},
@ -293,6 +316,9 @@ func TestParse(t *testing.T) {
ext = (*id.str)[id.pExt+1:]
}
}
if tag, _ := Parse(id.String()); tag.String() != id.String() {
t.Errorf("%d: reparse was %q; want %q", tt.i, id.String(), tag.String())
}
if ext != tt.ext {
t.Errorf("%d: ext was %q; want %q", tt.i, ext, tt.ext)
}