зеркало из https://github.com/mozilla/gecko-dev.git
Merge mozilla-central to autoland. a=merge CLOSED TREE
This commit is contained in:
Коммит
f2ab2502d7
|
@ -44,7 +44,6 @@ mozilla.pth:third_party/python/attrs/src
|
|||
python2:mozilla.pth:third_party/python/backports
|
||||
mozilla.pth:third_party/python/blessings
|
||||
mozilla.pth:third_party/python/cbor2
|
||||
mozilla.pth:third_party/python/chardet
|
||||
mozilla.pth:third_party/python/Click
|
||||
mozilla.pth:third_party/python/compare-locales
|
||||
mozilla.pth:third_party/python/cookies
|
||||
|
@ -60,7 +59,6 @@ mozilla.pth:third_party/python/fluent.syntax
|
|||
mozilla.pth:third_party/python/funcsigs
|
||||
python2:mozilla.pth:third_party/python/futures
|
||||
mozilla.pth:third_party/python/gyp/pylib
|
||||
mozilla.pth:third_party/python/idna
|
||||
mozilla.pth:third_party/python/importlib_metadata
|
||||
mozilla.pth:third_party/python/iso8601
|
||||
mozilla.pth:third_party/python/Jinja2/src
|
||||
|
@ -99,8 +97,6 @@ mozilla.pth:third_party/python/six
|
|||
mozilla.pth:third_party/python/slugid
|
||||
mozilla.pth:third_party/python/taskcluster
|
||||
mozilla.pth:third_party/python/taskcluster-urls
|
||||
python2:mozilla.pth:third_party/python/typing_extensions/src_py2
|
||||
python3:mozilla.pth:third_party/python/typing_extensions/src_py3
|
||||
mozilla.pth:third_party/python/urllib3/src
|
||||
mozilla.pth:third_party/python/voluptuous
|
||||
mozilla.pth:third_party/python/yamllint
|
||||
|
|
|
@ -1,15 +1,5 @@
|
|||
.. currentmodule:: jinja2
|
||||
|
||||
Version 2.11.3
|
||||
--------------
|
||||
|
||||
Released 2021-01-31
|
||||
|
||||
- Improve the speed of the ``urlize`` filter by reducing regex
|
||||
backtracking. Email matching requires a word character at the start
|
||||
of the domain part, and only word characters in the TLD. :pr:`1343`
|
||||
|
||||
|
||||
Version 2.11.2
|
||||
--------------
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
Metadata-Version: 2.1
|
||||
Name: Jinja2
|
||||
Version: 2.11.3
|
||||
Version: 2.11.2
|
||||
Summary: A very fast and expressive template engine.
|
||||
Home-page: https://palletsprojects.com/p/jinja/
|
||||
Author: Armin Ronacher
|
||||
|
|
|
@ -41,4 +41,4 @@ from .utils import evalcontextfunction
|
|||
from .utils import is_undefined
|
||||
from .utils import select_autoescape
|
||||
|
||||
__version__ = "2.11.3"
|
||||
__version__ = "2.11.2"
|
||||
|
|
|
@ -268,16 +268,16 @@ def do_dictsort(value, case_sensitive=False, by="key", reverse=False):
|
|||
|
||||
.. sourcecode:: jinja
|
||||
|
||||
{% for key, value in mydict|dictsort %}
|
||||
{% for item in mydict|dictsort %}
|
||||
sort the dict by key, case insensitive
|
||||
|
||||
{% for key, value in mydict|dictsort(reverse=true) %}
|
||||
{% for item in mydict|dictsort(reverse=true) %}
|
||||
sort the dict by key, case insensitive, reverse order
|
||||
|
||||
{% for key, value in mydict|dictsort(true) %}
|
||||
{% for item in mydict|dictsort(true) %}
|
||||
sort the dict by key, case sensitive
|
||||
|
||||
{% for key, value in mydict|dictsort(false, 'value') %}
|
||||
{% for item in mydict|dictsort(false, 'value') %}
|
||||
sort the dict by value, case insensitive
|
||||
"""
|
||||
if by == "key":
|
||||
|
|
|
@ -6,8 +6,6 @@ import warnings
|
|||
from collections import deque
|
||||
from random import choice
|
||||
from random import randrange
|
||||
from string import ascii_letters as _letters
|
||||
from string import digits as _digits
|
||||
from threading import Lock
|
||||
|
||||
from markupsafe import escape
|
||||
|
@ -18,6 +16,20 @@ from ._compat import string_types
|
|||
from ._compat import text_type
|
||||
from ._compat import url_quote
|
||||
|
||||
_word_split_re = re.compile(r"(\s+)")
|
||||
_punctuation_re = re.compile(
|
||||
"^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$"
|
||||
% (
|
||||
"|".join(map(re.escape, ("(", "<", "<"))),
|
||||
"|".join(map(re.escape, (".", ",", ")", ">", "\n", ">"))),
|
||||
)
|
||||
)
|
||||
_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
|
||||
_striptags_re = re.compile(r"(<!--.*?-->|<[^>]*>)")
|
||||
_entity_re = re.compile(r"&([^;]+);")
|
||||
_letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
_digits = "0123456789"
|
||||
|
||||
# special singleton representing missing values for the runtime
|
||||
missing = type("MissingType", (), {"__repr__": lambda x: "missing"})()
|
||||
|
||||
|
@ -198,65 +210,48 @@ def urlize(text, trim_url_limit=None, rel=None, target=None):
|
|||
and (x[:limit] + (len(x) >= limit and "..." or ""))
|
||||
or x
|
||||
)
|
||||
words = re.split(r"(\s+)", text_type(escape(text)))
|
||||
words = _word_split_re.split(text_type(escape(text)))
|
||||
rel_attr = rel and ' rel="%s"' % text_type(escape(rel)) or ""
|
||||
target_attr = target and ' target="%s"' % escape(target) or ""
|
||||
|
||||
for i, word in enumerate(words):
|
||||
head, middle, tail = "", word, ""
|
||||
match = re.match(r"^([(<]|<)+", middle)
|
||||
|
||||
match = _punctuation_re.match(word)
|
||||
if match:
|
||||
head = match.group()
|
||||
middle = middle[match.end() :]
|
||||
|
||||
# Unlike lead, which is anchored to the start of the string,
|
||||
# need to check that the string ends with any of the characters
|
||||
# before trying to match all of them, to avoid backtracking.
|
||||
if middle.endswith((")", ">", ".", ",", "\n", ">")):
|
||||
match = re.search(r"([)>.,\n]|>)+$", middle)
|
||||
|
||||
if match:
|
||||
tail = match.group()
|
||||
middle = middle[: match.start()]
|
||||
|
||||
if middle.startswith("www.") or (
|
||||
"@" not in middle
|
||||
and not middle.startswith("http://")
|
||||
and not middle.startswith("https://")
|
||||
and len(middle) > 0
|
||||
and middle[0] in _letters + _digits
|
||||
and (
|
||||
middle.endswith(".org")
|
||||
or middle.endswith(".net")
|
||||
or middle.endswith(".com")
|
||||
)
|
||||
):
|
||||
middle = '<a href="http://%s"%s%s>%s</a>' % (
|
||||
middle,
|
||||
rel_attr,
|
||||
target_attr,
|
||||
trim_url(middle),
|
||||
)
|
||||
|
||||
if middle.startswith("http://") or middle.startswith("https://"):
|
||||
middle = '<a href="%s"%s%s>%s</a>' % (
|
||||
middle,
|
||||
rel_attr,
|
||||
target_attr,
|
||||
trim_url(middle),
|
||||
)
|
||||
|
||||
if (
|
||||
"@" in middle
|
||||
and not middle.startswith("www.")
|
||||
and ":" not in middle
|
||||
and re.match(r"^\S+@\w[\w.-]*\.\w+$", middle)
|
||||
):
|
||||
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
|
||||
|
||||
words[i] = head + middle + tail
|
||||
|
||||
lead, middle, trail = match.groups()
|
||||
if middle.startswith("www.") or (
|
||||
"@" not in middle
|
||||
and not middle.startswith("http://")
|
||||
and not middle.startswith("https://")
|
||||
and len(middle) > 0
|
||||
and middle[0] in _letters + _digits
|
||||
and (
|
||||
middle.endswith(".org")
|
||||
or middle.endswith(".net")
|
||||
or middle.endswith(".com")
|
||||
)
|
||||
):
|
||||
middle = '<a href="http://%s"%s%s>%s</a>' % (
|
||||
middle,
|
||||
rel_attr,
|
||||
target_attr,
|
||||
trim_url(middle),
|
||||
)
|
||||
if middle.startswith("http://") or middle.startswith("https://"):
|
||||
middle = '<a href="%s"%s%s>%s</a>' % (
|
||||
middle,
|
||||
rel_attr,
|
||||
target_attr,
|
||||
trim_url(middle),
|
||||
)
|
||||
if (
|
||||
"@" in middle
|
||||
and not middle.startswith("www.")
|
||||
and ":" not in middle
|
||||
and _simple_email_re.match(middle)
|
||||
):
|
||||
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
|
||||
if lead + middle + trail != word:
|
||||
words[i] = lead + middle + trail
|
||||
return u"".join(words)
|
||||
|
||||
|
||||
|
|
|
@ -4,18 +4,6 @@ For a complete changelog, see:
|
|||
* https://github.com/yaml/pyyaml/commits/
|
||||
* https://bitbucket.org/xi/pyyaml/commits/
|
||||
|
||||
5.4.1 (2021-01-20)
|
||||
|
||||
* https://github.com/yaml/pyyaml/pull/480 -- Fix stub compat with older pyyaml versions that may unwittingly load it
|
||||
|
||||
5.4 (2021-01-19)
|
||||
|
||||
* https://github.com/yaml/pyyaml/pull/407 -- Build modernization, remove distutils, fix metadata, build wheels, CI to GHA
|
||||
* https://github.com/yaml/pyyaml/pull/472 -- Fix for CVE-2020-14343, moves arbitrary python tags to UnsafeLoader
|
||||
* https://github.com/yaml/pyyaml/pull/441 -- Fix memory leak in implicit resolver setup
|
||||
* https://github.com/yaml/pyyaml/pull/392 -- Fix py2 copy support for timezone objects
|
||||
* https://github.com/yaml/pyyaml/pull/378 -- Fix compatibility with Jython
|
||||
|
||||
5.3.1 (2020-03-18)
|
||||
|
||||
* https://github.com/yaml/pyyaml/pull/386 -- Prevents arbitrary code execution during python/object/new constructor
|
||||
|
@ -23,7 +11,7 @@ For a complete changelog, see:
|
|||
5.3 (2020-01-06)
|
||||
|
||||
* https://github.com/yaml/pyyaml/pull/290 -- Use `is` instead of equality for comparing with `None`
|
||||
* https://github.com/yaml/pyyaml/pull/270 -- Fix typos and stylistic nit
|
||||
* https://github.com/yaml/pyyaml/pull/270 -- fix typos and stylistic nit
|
||||
* https://github.com/yaml/pyyaml/pull/309 -- Fix up small typo
|
||||
* https://github.com/yaml/pyyaml/pull/161 -- Fix handling of __slots__
|
||||
* https://github.com/yaml/pyyaml/pull/358 -- Allow calling add_multi_constructor with None
|
||||
|
@ -33,8 +21,8 @@ For a complete changelog, see:
|
|||
* https://github.com/yaml/pyyaml/pull/359 -- Use full_load in yaml-highlight example
|
||||
* https://github.com/yaml/pyyaml/pull/244 -- Document that PyYAML is implemented with Cython
|
||||
* https://github.com/yaml/pyyaml/pull/329 -- Fix for Python 3.10
|
||||
* https://github.com/yaml/pyyaml/pull/310 -- Increase size of index, line, and column fields
|
||||
* https://github.com/yaml/pyyaml/pull/260 -- Remove some unused imports
|
||||
* https://github.com/yaml/pyyaml/pull/310 -- increase size of index, line, and column fields
|
||||
* https://github.com/yaml/pyyaml/pull/260 -- remove some unused imports
|
||||
* https://github.com/yaml/pyyaml/pull/163 -- Create timezone-aware datetimes when parsed as such
|
||||
* https://github.com/yaml/pyyaml/pull/363 -- Add tests for timezone
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
Copyright (c) 2017-2021 Ingy döt Net
|
||||
Copyright (c) 2017-2020 Ingy döt Net
|
||||
Copyright (c) 2006-2016 Kirill Simonov
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
|
|
|
@ -1,10 +0,0 @@
|
|||
include CHANGES README LICENSE Makefile pyproject.toml setup.py
|
||||
recursive-include lib/yaml *.py
|
||||
recursive-include lib/_yaml *.py
|
||||
recursive-include lib3/yaml *.py
|
||||
recursive-include lib3/_yaml *.py
|
||||
recursive-include examples *.py *.cfg *.yaml
|
||||
recursive-include tests/data *
|
||||
recursive-include tests/lib *.py
|
||||
recursive-include tests/lib3 *.py
|
||||
recursive-include yaml *
|
|
@ -1,44 +0,0 @@
|
|||
|
||||
.PHONY: default build buildext force forceext install installext test testext dist clean
|
||||
|
||||
PYTHON=/usr/bin/python
|
||||
TEST=
|
||||
PARAMETERS=
|
||||
|
||||
build:
|
||||
${PYTHON} setup.py build ${PARAMETERS}
|
||||
|
||||
buildext:
|
||||
${PYTHON} setup.py --with-libyaml build ${PARAMETERS}
|
||||
|
||||
force:
|
||||
${PYTHON} setup.py build -f ${PARAMETERS}
|
||||
|
||||
forceext:
|
||||
${PYTHON} setup.py --with-libyaml build -f ${PARAMETERS}
|
||||
|
||||
install:
|
||||
${PYTHON} setup.py install ${PARAMETERS}
|
||||
|
||||
installext:
|
||||
${PYTHON} setup.py --with-libyaml install ${PARAMETERS}
|
||||
|
||||
test: build
|
||||
${PYTHON} tests/lib/test_build.py ${TEST}
|
||||
|
||||
testext: buildext
|
||||
${PYTHON} tests/lib/test_build_ext.py ${TEST}
|
||||
|
||||
testall:
|
||||
${PYTHON} setup.py test
|
||||
|
||||
dist:
|
||||
@# No longer uploading a zip file to pypi
|
||||
@# ${PYTHON} setup.py --with-libyaml sdist --formats=zip,gztar
|
||||
${PYTHON} setup.py --with-libyaml sdist --formats=gztar
|
||||
|
||||
windist:
|
||||
${PYTHON} setup.py --with-libyaml bdist_wininst
|
||||
|
||||
clean:
|
||||
${PYTHON} setup.py --with-libyaml clean -a
|
|
@ -1,17 +1,12 @@
|
|||
Metadata-Version: 1.2
|
||||
Metadata-Version: 1.1
|
||||
Name: PyYAML
|
||||
Version: 5.4.1
|
||||
Version: 5.3.1
|
||||
Summary: YAML parser and emitter for Python
|
||||
Home-page: https://pyyaml.org/
|
||||
Home-page: https://github.com/yaml/pyyaml
|
||||
Author: Kirill Simonov
|
||||
Author-email: xi@resolvent.net
|
||||
License: MIT
|
||||
Download-URL: https://pypi.org/project/PyYAML/
|
||||
Project-URL: Bug Tracker, https://github.com/yaml/pyyaml/issues
|
||||
Project-URL: CI, https://github.com/yaml/pyyaml/actions
|
||||
Project-URL: Documentation, https://pyyaml.org/wiki/PyYAMLDocumentation
|
||||
Project-URL: Mailing lists, http://lists.sourceforge.net/lists/listinfo/yaml-core
|
||||
Project-URL: Source Code, https://github.com/yaml/pyyaml
|
||||
Description: YAML is a data serialization format designed for human readability
|
||||
and interaction with scripting languages. PyYAML is a YAML parser
|
||||
and emitter for Python.
|
||||
|
@ -33,12 +28,11 @@ Classifier: Programming Language :: Python
|
|||
Classifier: Programming Language :: Python :: 2
|
||||
Classifier: Programming Language :: Python :: 2.7
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.5
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Topic :: Text Processing :: Markup
|
||||
Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,33 +0,0 @@
|
|||
# This is a stub package designed to roughly emulate the _yaml
|
||||
# extension module, which previously existed as a standalone module
|
||||
# and has been moved into the `yaml` package namespace.
|
||||
# It does not perfectly mimic its old counterpart, but should get
|
||||
# close enough for anyone who's relying on it even when they shouldn't.
|
||||
import yaml
|
||||
|
||||
# in some circumstances, the yaml module we imoprted may be from a different version, so we need
|
||||
# to tread carefully when poking at it here (it may not have the attributes we expect)
|
||||
if not getattr(yaml, '__with_libyaml__', False):
|
||||
from sys import version_info
|
||||
|
||||
exc = ModuleNotFoundError if version_info >= (3, 6) else ImportError
|
||||
raise exc("No module named '_yaml'")
|
||||
else:
|
||||
from yaml._yaml import *
|
||||
import warnings
|
||||
warnings.warn(
|
||||
'The _yaml extension module is now located at yaml._yaml'
|
||||
' and its location is subject to change. To use the'
|
||||
' LibYAML-based parser and emitter, import from `yaml`:'
|
||||
' `from yaml import CLoader as Loader, CDumper as Dumper`.',
|
||||
DeprecationWarning
|
||||
)
|
||||
del warnings
|
||||
# Don't `del yaml` here because yaml is actually an existing
|
||||
# namespace member of _yaml.
|
||||
|
||||
__name__ = '_yaml'
|
||||
# If the module is top-level (i.e. not a part of any specific package)
|
||||
# then the attribute should be set to ''.
|
||||
# https://docs.python.org/3.8/library/types.html
|
||||
__package__ = ''
|
|
@ -8,7 +8,7 @@ from nodes import *
|
|||
from loader import *
|
||||
from dumper import *
|
||||
|
||||
__version__ = '5.4.1'
|
||||
__version__ = '5.3.1'
|
||||
|
||||
try:
|
||||
from cyaml import *
|
||||
|
|
|
@ -38,12 +38,6 @@ class timezone(datetime.tzinfo):
|
|||
def dst(self, dt=None):
|
||||
return datetime.timedelta(0)
|
||||
|
||||
def __copy__(self):
|
||||
return self.__deepcopy__()
|
||||
|
||||
def __deepcopy__(self, memodict={}):
|
||||
return self.__class__(self.utcoffset())
|
||||
|
||||
__repr__ = __str__ = tzname
|
||||
|
||||
|
||||
|
@ -728,6 +722,18 @@ FullConstructor.add_multi_constructor(
|
|||
u'tag:yaml.org,2002:python/name:',
|
||||
FullConstructor.construct_python_name)
|
||||
|
||||
FullConstructor.add_multi_constructor(
|
||||
u'tag:yaml.org,2002:python/module:',
|
||||
FullConstructor.construct_python_module)
|
||||
|
||||
FullConstructor.add_multi_constructor(
|
||||
u'tag:yaml.org,2002:python/object:',
|
||||
FullConstructor.construct_python_object)
|
||||
|
||||
FullConstructor.add_multi_constructor(
|
||||
u'tag:yaml.org,2002:python/object/new:',
|
||||
FullConstructor.construct_python_object_new)
|
||||
|
||||
class UnsafeConstructor(FullConstructor):
|
||||
|
||||
def find_python_module(self, name, mark):
|
||||
|
@ -744,18 +750,6 @@ class UnsafeConstructor(FullConstructor):
|
|||
return super(UnsafeConstructor, self).set_python_instance_state(
|
||||
instance, state, unsafe=True)
|
||||
|
||||
UnsafeConstructor.add_multi_constructor(
|
||||
u'tag:yaml.org,2002:python/module:',
|
||||
UnsafeConstructor.construct_python_module)
|
||||
|
||||
UnsafeConstructor.add_multi_constructor(
|
||||
u'tag:yaml.org,2002:python/object:',
|
||||
UnsafeConstructor.construct_python_object)
|
||||
|
||||
UnsafeConstructor.add_multi_constructor(
|
||||
u'tag:yaml.org,2002:python/object/new:',
|
||||
UnsafeConstructor.construct_python_object_new)
|
||||
|
||||
UnsafeConstructor.add_multi_constructor(
|
||||
u'tag:yaml.org,2002:python/object/apply:',
|
||||
UnsafeConstructor.construct_python_object_apply)
|
||||
|
|
|
@ -4,7 +4,7 @@ __all__ = [
|
|||
'CBaseDumper', 'CSafeDumper', 'CDumper'
|
||||
]
|
||||
|
||||
from yaml._yaml import CParser, CEmitter
|
||||
from _yaml import CParser, CEmitter
|
||||
|
||||
from constructor import *
|
||||
|
||||
|
|
|
@ -137,14 +137,9 @@ class Reader(object):
|
|||
self.update(1)
|
||||
|
||||
if has_ucs4:
|
||||
NON_PRINTABLE = u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]'
|
||||
elif sys.platform.startswith('java'):
|
||||
# Jython doesn't support lone surrogates https://bugs.jython.org/issue2048
|
||||
NON_PRINTABLE = u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]'
|
||||
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
|
||||
else:
|
||||
# Need to use eval here due to the above Jython issue
|
||||
NON_PRINTABLE = eval(r"u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uFFFD]|(?:^|[^\uD800-\uDBFF])[\uDC00-\uDFFF]|[\uD800-\uDBFF](?:[^\uDC00-\uDFFF]|$)'")
|
||||
NON_PRINTABLE = re.compile(NON_PRINTABLE)
|
||||
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uFFFD]|(?:^|[^\uD800-\uDBFF])[\uDC00-\uDFFF]|[\uD800-\uDBFF](?:[^\uDC00-\uDFFF]|$)')
|
||||
def check_printable(self, data):
|
||||
match = self.NON_PRINTABLE.search(data)
|
||||
if match:
|
||||
|
|
|
@ -146,8 +146,8 @@ class BaseResolver(object):
|
|||
resolvers = self.yaml_implicit_resolvers.get(u'', [])
|
||||
else:
|
||||
resolvers = self.yaml_implicit_resolvers.get(value[0], [])
|
||||
wildcard_resolvers = self.yaml_implicit_resolvers.get(None, [])
|
||||
for tag, regexp in resolvers + wildcard_resolvers:
|
||||
resolvers += self.yaml_implicit_resolvers.get(None, [])
|
||||
for tag, regexp in resolvers:
|
||||
if regexp.match(value):
|
||||
return tag
|
||||
implicit = implicit[1]
|
||||
|
|
|
@ -1,33 +0,0 @@
|
|||
# This is a stub package designed to roughly emulate the _yaml
|
||||
# extension module, which previously existed as a standalone module
|
||||
# and has been moved into the `yaml` package namespace.
|
||||
# It does not perfectly mimic its old counterpart, but should get
|
||||
# close enough for anyone who's relying on it even when they shouldn't.
|
||||
import yaml
|
||||
|
||||
# in some circumstances, the yaml module we imoprted may be from a different version, so we need
|
||||
# to tread carefully when poking at it here (it may not have the attributes we expect)
|
||||
if not getattr(yaml, '__with_libyaml__', False):
|
||||
from sys import version_info
|
||||
|
||||
exc = ModuleNotFoundError if version_info >= (3, 6) else ImportError
|
||||
raise exc("No module named '_yaml'")
|
||||
else:
|
||||
from yaml._yaml import *
|
||||
import warnings
|
||||
warnings.warn(
|
||||
'The _yaml extension module is now located at yaml._yaml'
|
||||
' and its location is subject to change. To use the'
|
||||
' LibYAML-based parser and emitter, import from `yaml`:'
|
||||
' `from yaml import CLoader as Loader, CDumper as Dumper`.',
|
||||
DeprecationWarning
|
||||
)
|
||||
del warnings
|
||||
# Don't `del yaml` here because yaml is actually an existing
|
||||
# namespace member of _yaml.
|
||||
|
||||
__name__ = '_yaml'
|
||||
# If the module is top-level (i.e. not a part of any specific package)
|
||||
# then the attribute should be set to ''.
|
||||
# https://docs.python.org/3.8/library/types.html
|
||||
__package__ = ''
|
|
@ -8,7 +8,7 @@ from .nodes import *
|
|||
from .loader import *
|
||||
from .dumper import *
|
||||
|
||||
__version__ = '5.4.1'
|
||||
__version__ = '5.3.1'
|
||||
try:
|
||||
from .cyaml import *
|
||||
__with_libyaml__ = True
|
||||
|
|
|
@ -710,6 +710,18 @@ FullConstructor.add_multi_constructor(
|
|||
'tag:yaml.org,2002:python/name:',
|
||||
FullConstructor.construct_python_name)
|
||||
|
||||
FullConstructor.add_multi_constructor(
|
||||
'tag:yaml.org,2002:python/module:',
|
||||
FullConstructor.construct_python_module)
|
||||
|
||||
FullConstructor.add_multi_constructor(
|
||||
'tag:yaml.org,2002:python/object:',
|
||||
FullConstructor.construct_python_object)
|
||||
|
||||
FullConstructor.add_multi_constructor(
|
||||
'tag:yaml.org,2002:python/object/new:',
|
||||
FullConstructor.construct_python_object_new)
|
||||
|
||||
class UnsafeConstructor(FullConstructor):
|
||||
|
||||
def find_python_module(self, name, mark):
|
||||
|
@ -726,18 +738,6 @@ class UnsafeConstructor(FullConstructor):
|
|||
return super(UnsafeConstructor, self).set_python_instance_state(
|
||||
instance, state, unsafe=True)
|
||||
|
||||
UnsafeConstructor.add_multi_constructor(
|
||||
'tag:yaml.org,2002:python/module:',
|
||||
UnsafeConstructor.construct_python_module)
|
||||
|
||||
UnsafeConstructor.add_multi_constructor(
|
||||
'tag:yaml.org,2002:python/object:',
|
||||
UnsafeConstructor.construct_python_object)
|
||||
|
||||
UnsafeConstructor.add_multi_constructor(
|
||||
'tag:yaml.org,2002:python/object/new:',
|
||||
UnsafeConstructor.construct_python_object_new)
|
||||
|
||||
UnsafeConstructor.add_multi_constructor(
|
||||
'tag:yaml.org,2002:python/object/apply:',
|
||||
UnsafeConstructor.construct_python_object_apply)
|
||||
|
|
|
@ -4,7 +4,7 @@ __all__ = [
|
|||
'CBaseDumper', 'CSafeDumper', 'CDumper'
|
||||
]
|
||||
|
||||
from yaml._yaml import CParser, CEmitter
|
||||
from _yaml import CParser, CEmitter
|
||||
|
||||
from .constructor import *
|
||||
|
||||
|
|
|
@ -146,8 +146,8 @@ class BaseResolver:
|
|||
resolvers = self.yaml_implicit_resolvers.get('', [])
|
||||
else:
|
||||
resolvers = self.yaml_implicit_resolvers.get(value[0], [])
|
||||
wildcard_resolvers = self.yaml_implicit_resolvers.get(None, [])
|
||||
for tag, regexp in resolvers + wildcard_resolvers:
|
||||
resolvers += self.yaml_implicit_resolvers.get(None, [])
|
||||
for tag, regexp in resolvers:
|
||||
if regexp.match(value):
|
||||
return tag
|
||||
implicit = implicit[1]
|
||||
|
|
|
@ -1,3 +0,0 @@
|
|||
[build-system]
|
||||
requires = ["setuptools", "wheel", "Cython"]
|
||||
build-backend = "setuptools.build_meta"
|
|
@ -1,9 +1,25 @@
|
|||
|
||||
# The INCLUDE and LIB directories to build the '_yaml' extension.
|
||||
# You may also set them using the options '-I' and '-L'.
|
||||
[build_ext]
|
||||
|
||||
# List of directories to search for 'yaml.h' (separated by ':').
|
||||
#include_dirs=/usr/local/include:../../include
|
||||
|
||||
# List of directories to search for 'libyaml.a' (separated by ':').
|
||||
#library_dirs=/usr/local/lib:../../lib
|
||||
|
||||
# An alternative compiler to build the extension.
|
||||
#compiler=mingw32
|
||||
|
||||
# Additional preprocessor definitions might be required.
|
||||
#define=YAML_DECLARE_STATIC
|
||||
|
||||
# The following options are used to build PyYAML Windows installer
|
||||
# for Python 2.7 on my PC:
|
||||
#include_dirs=../../../libyaml/tags/0.1.4/include
|
||||
#library_dirs=../../../libyaml/tags/0.1.4/win32/vs2008/output/release/lib
|
||||
#define=YAML_DECLARE_STATIC
|
||||
|
||||
[metadata]
|
||||
license_file = LICENSE
|
||||
|
||||
[egg_info]
|
||||
tag_build =
|
||||
tag_date = 0
|
||||
|
||||
license_file = LICENSE
|
|
@ -1,6 +1,6 @@
|
|||
|
||||
NAME = 'PyYAML'
|
||||
VERSION = '5.4.1'
|
||||
VERSION = '5.3.1'
|
||||
DESCRIPTION = "YAML parser and emitter for Python"
|
||||
LONG_DESCRIPTION = """\
|
||||
YAML is a data serialization format designed for human readability
|
||||
|
@ -18,7 +18,7 @@ AUTHOR = "Kirill Simonov"
|
|||
AUTHOR_EMAIL = 'xi@resolvent.net'
|
||||
LICENSE = "MIT"
|
||||
PLATFORMS = "Any"
|
||||
URL = "https://pyyaml.org/"
|
||||
URL = "https://github.com/yaml/pyyaml"
|
||||
DOWNLOAD_URL = "https://pypi.org/project/PyYAML/"
|
||||
CLASSIFIERS = [
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
|
@ -30,22 +30,16 @@ CLASSIFIERS = [
|
|||
"Programming Language :: Python :: 2",
|
||||
"Programming Language :: Python :: 2.7",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.5",
|
||||
"Programming Language :: Python :: 3.6",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: Implementation :: CPython",
|
||||
"Programming Language :: Python :: Implementation :: PyPy",
|
||||
"Topic :: Software Development :: Libraries :: Python Modules",
|
||||
"Topic :: Text Processing :: Markup",
|
||||
]
|
||||
PROJECT_URLS = {
|
||||
'Bug Tracker': 'https://github.com/yaml/pyyaml/issues',
|
||||
'CI': 'https://github.com/yaml/pyyaml/actions',
|
||||
'Documentation': 'https://pyyaml.org/wiki/PyYAMLDocumentation',
|
||||
'Mailing lists': 'http://lists.sourceforge.net/lists/listinfo/yaml-core',
|
||||
'Source Code': 'https://github.com/yaml/pyyaml',
|
||||
}
|
||||
|
||||
|
||||
LIBYAML_CHECK = """
|
||||
#include <yaml.h>
|
||||
|
@ -65,15 +59,24 @@ int main(void) {
|
|||
"""
|
||||
|
||||
|
||||
import sys, os, os.path, platform, warnings
|
||||
import sys, os.path, platform, warnings
|
||||
|
||||
from distutils import log
|
||||
from setuptools import setup, Command, Distribution as _Distribution, Extension as _Extension
|
||||
from setuptools.command.build_ext import build_ext as _build_ext
|
||||
from distutils.core import setup, Command
|
||||
from distutils.core import Distribution as _Distribution
|
||||
from distutils.core import Extension as _Extension
|
||||
from distutils.command.build_ext import build_ext as _build_ext
|
||||
from distutils.command.bdist_rpm import bdist_rpm as _bdist_rpm
|
||||
from distutils.errors import DistutilsError, CompileError, LinkError, DistutilsPlatformError
|
||||
|
||||
if 'setuptools.extension' in sys.modules:
|
||||
_Extension = sys.modules['setuptools.extension']._Extension
|
||||
sys.modules['distutils.core'].Extension = _Extension
|
||||
sys.modules['distutils.extension'].Extension = _Extension
|
||||
sys.modules['distutils.command.build_ext'].Extension = _Extension
|
||||
|
||||
with_cython = False
|
||||
if 'sdist' in sys.argv or os.environ.get('PYYAML_FORCE_CYTHON') == '1':
|
||||
if 'sdist' in sys.argv:
|
||||
# we need cython here
|
||||
with_cython = True
|
||||
try:
|
||||
|
@ -103,8 +106,8 @@ if platform.system() == 'Windows':
|
|||
for w in windows_ignore_warnings:
|
||||
warnings.filterwarnings('ignore', w)
|
||||
|
||||
|
||||
class Distribution(_Distribution):
|
||||
|
||||
def __init__(self, attrs=None):
|
||||
_Distribution.__init__(self, attrs)
|
||||
if not self.ext_modules:
|
||||
|
@ -135,15 +138,10 @@ class Distribution(_Distribution):
|
|||
|
||||
def ext_status(self, ext):
|
||||
implementation = platform.python_implementation()
|
||||
if implementation not in ['CPython', 'PyPy']:
|
||||
if implementation != 'CPython':
|
||||
return False
|
||||
if isinstance(ext, Extension):
|
||||
# the "build by default" behavior is implemented by this returning None
|
||||
with_ext = getattr(self, ext.attr_name) or os.environ.get('PYYAML_FORCE_{0}'.format(ext.feature_name.upper()))
|
||||
try:
|
||||
with_ext = int(with_ext) # attempt coerce envvar to int
|
||||
except TypeError:
|
||||
pass
|
||||
with_ext = getattr(self, ext.attr_name)
|
||||
return with_ext
|
||||
else:
|
||||
return True
|
||||
|
@ -235,6 +233,27 @@ class build_ext(_build_ext):
|
|||
log.warn("Error compiling module, falling back to pure Python")
|
||||
|
||||
|
||||
class bdist_rpm(_bdist_rpm):
|
||||
|
||||
def _make_spec_file(self):
|
||||
argv0 = sys.argv[0]
|
||||
features = []
|
||||
for ext in self.distribution.ext_modules:
|
||||
if not isinstance(ext, Extension):
|
||||
continue
|
||||
with_ext = getattr(self.distribution, ext.attr_name)
|
||||
if with_ext is None:
|
||||
continue
|
||||
if with_ext:
|
||||
features.append('--'+ext.option_name)
|
||||
else:
|
||||
features.append('--'+ext.neg_option_name)
|
||||
sys.argv[0] = ' '.join([argv0]+features)
|
||||
spec_file = _bdist_rpm._make_spec_file(self)
|
||||
sys.argv[0] = argv0
|
||||
return spec_file
|
||||
|
||||
|
||||
class test(Command):
|
||||
|
||||
user_options = []
|
||||
|
@ -260,6 +279,7 @@ class test(Command):
|
|||
|
||||
cmdclass = {
|
||||
'build_ext': build_ext,
|
||||
'bdist_rpm': bdist_rpm,
|
||||
'test': test,
|
||||
}
|
||||
if bdist_wheel:
|
||||
|
@ -280,17 +300,16 @@ if __name__ == '__main__':
|
|||
url=URL,
|
||||
download_url=DOWNLOAD_URL,
|
||||
classifiers=CLASSIFIERS,
|
||||
project_urls=PROJECT_URLS,
|
||||
|
||||
package_dir={'': {2: 'lib', 3: 'lib3'}[sys.version_info[0]]},
|
||||
packages=['yaml', '_yaml'],
|
||||
packages=['yaml'],
|
||||
ext_modules=[
|
||||
Extension('yaml._yaml', ['yaml/_yaml.pyx'],
|
||||
Extension('_yaml', ['ext/_yaml.pyx'],
|
||||
'libyaml', "LibYAML bindings", LIBYAML_CHECK,
|
||||
libraries=['yaml']),
|
||||
],
|
||||
|
||||
distclass=Distribution,
|
||||
cmdclass=cmdclass,
|
||||
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*',
|
||||
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*',
|
||||
)
|
||||
|
|
|
@ -1,504 +0,0 @@
|
|||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
Version 2.1, February 1999
|
||||
|
||||
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
|
||||
51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
[This is the first released version of the Lesser GPL. It also counts
|
||||
as the successor of the GNU Library Public License, version 2, hence
|
||||
the version number 2.1.]
|
||||
|
||||
Preamble
|
||||
|
||||
The licenses for most software are designed to take away your
|
||||
freedom to share and change it. By contrast, the GNU General Public
|
||||
Licenses are intended to guarantee your freedom to share and change
|
||||
free software--to make sure the software is free for all its users.
|
||||
|
||||
This license, the Lesser General Public License, applies to some
|
||||
specially designated software packages--typically libraries--of the
|
||||
Free Software Foundation and other authors who decide to use it. You
|
||||
can use it too, but we suggest you first think carefully about whether
|
||||
this license or the ordinary General Public License is the better
|
||||
strategy to use in any particular case, based on the explanations below.
|
||||
|
||||
When we speak of free software, we are referring to freedom of use,
|
||||
not price. Our General Public Licenses are designed to make sure that
|
||||
you have the freedom to distribute copies of free software (and charge
|
||||
for this service if you wish); that you receive source code or can get
|
||||
it if you want it; that you can change the software and use pieces of
|
||||
it in new free programs; and that you are informed that you can do
|
||||
these things.
|
||||
|
||||
To protect your rights, we need to make restrictions that forbid
|
||||
distributors to deny you these rights or to ask you to surrender these
|
||||
rights. These restrictions translate to certain responsibilities for
|
||||
you if you distribute copies of the library or if you modify it.
|
||||
|
||||
For example, if you distribute copies of the library, whether gratis
|
||||
or for a fee, you must give the recipients all the rights that we gave
|
||||
you. You must make sure that they, too, receive or can get the source
|
||||
code. If you link other code with the library, you must provide
|
||||
complete object files to the recipients, so that they can relink them
|
||||
with the library after making changes to the library and recompiling
|
||||
it. And you must show them these terms so they know their rights.
|
||||
|
||||
We protect your rights with a two-step method: (1) we copyright the
|
||||
library, and (2) we offer you this license, which gives you legal
|
||||
permission to copy, distribute and/or modify the library.
|
||||
|
||||
To protect each distributor, we want to make it very clear that
|
||||
there is no warranty for the free library. Also, if the library is
|
||||
modified by someone else and passed on, the recipients should know
|
||||
that what they have is not the original version, so that the original
|
||||
author's reputation will not be affected by problems that might be
|
||||
introduced by others.
|
||||
|
||||
Finally, software patents pose a constant threat to the existence of
|
||||
any free program. We wish to make sure that a company cannot
|
||||
effectively restrict the users of a free program by obtaining a
|
||||
restrictive license from a patent holder. Therefore, we insist that
|
||||
any patent license obtained for a version of the library must be
|
||||
consistent with the full freedom of use specified in this license.
|
||||
|
||||
Most GNU software, including some libraries, is covered by the
|
||||
ordinary GNU General Public License. This license, the GNU Lesser
|
||||
General Public License, applies to certain designated libraries, and
|
||||
is quite different from the ordinary General Public License. We use
|
||||
this license for certain libraries in order to permit linking those
|
||||
libraries into non-free programs.
|
||||
|
||||
When a program is linked with a library, whether statically or using
|
||||
a shared library, the combination of the two is legally speaking a
|
||||
combined work, a derivative of the original library. The ordinary
|
||||
General Public License therefore permits such linking only if the
|
||||
entire combination fits its criteria of freedom. The Lesser General
|
||||
Public License permits more lax criteria for linking other code with
|
||||
the library.
|
||||
|
||||
We call this license the "Lesser" General Public License because it
|
||||
does Less to protect the user's freedom than the ordinary General
|
||||
Public License. It also provides other free software developers Less
|
||||
of an advantage over competing non-free programs. These disadvantages
|
||||
are the reason we use the ordinary General Public License for many
|
||||
libraries. However, the Lesser license provides advantages in certain
|
||||
special circumstances.
|
||||
|
||||
For example, on rare occasions, there may be a special need to
|
||||
encourage the widest possible use of a certain library, so that it becomes
|
||||
a de-facto standard. To achieve this, non-free programs must be
|
||||
allowed to use the library. A more frequent case is that a free
|
||||
library does the same job as widely used non-free libraries. In this
|
||||
case, there is little to gain by limiting the free library to free
|
||||
software only, so we use the Lesser General Public License.
|
||||
|
||||
In other cases, permission to use a particular library in non-free
|
||||
programs enables a greater number of people to use a large body of
|
||||
free software. For example, permission to use the GNU C Library in
|
||||
non-free programs enables many more people to use the whole GNU
|
||||
operating system, as well as its variant, the GNU/Linux operating
|
||||
system.
|
||||
|
||||
Although the Lesser General Public License is Less protective of the
|
||||
users' freedom, it does ensure that the user of a program that is
|
||||
linked with the Library has the freedom and the wherewithal to run
|
||||
that program using a modified version of the Library.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow. Pay close attention to the difference between a
|
||||
"work based on the library" and a "work that uses the library". The
|
||||
former contains code derived from the library, whereas the latter must
|
||||
be combined with the library in order to run.
|
||||
|
||||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. This License Agreement applies to any software library or other
|
||||
program which contains a notice placed by the copyright holder or
|
||||
other authorized party saying it may be distributed under the terms of
|
||||
this Lesser General Public License (also called "this License").
|
||||
Each licensee is addressed as "you".
|
||||
|
||||
A "library" means a collection of software functions and/or data
|
||||
prepared so as to be conveniently linked with application programs
|
||||
(which use some of those functions and data) to form executables.
|
||||
|
||||
The "Library", below, refers to any such software library or work
|
||||
which has been distributed under these terms. A "work based on the
|
||||
Library" means either the Library or any derivative work under
|
||||
copyright law: that is to say, a work containing the Library or a
|
||||
portion of it, either verbatim or with modifications and/or translated
|
||||
straightforwardly into another language. (Hereinafter, translation is
|
||||
included without limitation in the term "modification".)
|
||||
|
||||
"Source code" for a work means the preferred form of the work for
|
||||
making modifications to it. For a library, complete source code means
|
||||
all the source code for all modules it contains, plus any associated
|
||||
interface definition files, plus the scripts used to control compilation
|
||||
and installation of the library.
|
||||
|
||||
Activities other than copying, distribution and modification are not
|
||||
covered by this License; they are outside its scope. The act of
|
||||
running a program using the Library is not restricted, and output from
|
||||
such a program is covered only if its contents constitute a work based
|
||||
on the Library (independent of the use of the Library in a tool for
|
||||
writing it). Whether that is true depends on what the Library does
|
||||
and what the program that uses the Library does.
|
||||
|
||||
1. You may copy and distribute verbatim copies of the Library's
|
||||
complete source code as you receive it, in any medium, provided that
|
||||
you conspicuously and appropriately publish on each copy an
|
||||
appropriate copyright notice and disclaimer of warranty; keep intact
|
||||
all the notices that refer to this License and to the absence of any
|
||||
warranty; and distribute a copy of this License along with the
|
||||
Library.
|
||||
|
||||
You may charge a fee for the physical act of transferring a copy,
|
||||
and you may at your option offer warranty protection in exchange for a
|
||||
fee.
|
||||
|
||||
2. You may modify your copy or copies of the Library or any portion
|
||||
of it, thus forming a work based on the Library, and copy and
|
||||
distribute such modifications or work under the terms of Section 1
|
||||
above, provided that you also meet all of these conditions:
|
||||
|
||||
a) The modified work must itself be a software library.
|
||||
|
||||
b) You must cause the files modified to carry prominent notices
|
||||
stating that you changed the files and the date of any change.
|
||||
|
||||
c) You must cause the whole of the work to be licensed at no
|
||||
charge to all third parties under the terms of this License.
|
||||
|
||||
d) If a facility in the modified Library refers to a function or a
|
||||
table of data to be supplied by an application program that uses
|
||||
the facility, other than as an argument passed when the facility
|
||||
is invoked, then you must make a good faith effort to ensure that,
|
||||
in the event an application does not supply such function or
|
||||
table, the facility still operates, and performs whatever part of
|
||||
its purpose remains meaningful.
|
||||
|
||||
(For example, a function in a library to compute square roots has
|
||||
a purpose that is entirely well-defined independent of the
|
||||
application. Therefore, Subsection 2d requires that any
|
||||
application-supplied function or table used by this function must
|
||||
be optional: if the application does not supply it, the square
|
||||
root function must still compute square roots.)
|
||||
|
||||
These requirements apply to the modified work as a whole. If
|
||||
identifiable sections of that work are not derived from the Library,
|
||||
and can be reasonably considered independent and separate works in
|
||||
themselves, then this License, and its terms, do not apply to those
|
||||
sections when you distribute them as separate works. But when you
|
||||
distribute the same sections as part of a whole which is a work based
|
||||
on the Library, the distribution of the whole must be on the terms of
|
||||
this License, whose permissions for other licensees extend to the
|
||||
entire whole, and thus to each and every part regardless of who wrote
|
||||
it.
|
||||
|
||||
Thus, it is not the intent of this section to claim rights or contest
|
||||
your rights to work written entirely by you; rather, the intent is to
|
||||
exercise the right to control the distribution of derivative or
|
||||
collective works based on the Library.
|
||||
|
||||
In addition, mere aggregation of another work not based on the Library
|
||||
with the Library (or with a work based on the Library) on a volume of
|
||||
a storage or distribution medium does not bring the other work under
|
||||
the scope of this License.
|
||||
|
||||
3. You may opt to apply the terms of the ordinary GNU General Public
|
||||
License instead of this License to a given copy of the Library. To do
|
||||
this, you must alter all the notices that refer to this License, so
|
||||
that they refer to the ordinary GNU General Public License, version 2,
|
||||
instead of to this License. (If a newer version than version 2 of the
|
||||
ordinary GNU General Public License has appeared, then you can specify
|
||||
that version instead if you wish.) Do not make any other change in
|
||||
these notices.
|
||||
|
||||
Once this change is made in a given copy, it is irreversible for
|
||||
that copy, so the ordinary GNU General Public License applies to all
|
||||
subsequent copies and derivative works made from that copy.
|
||||
|
||||
This option is useful when you wish to copy part of the code of
|
||||
the Library into a program that is not a library.
|
||||
|
||||
4. You may copy and distribute the Library (or a portion or
|
||||
derivative of it, under Section 2) in object code or executable form
|
||||
under the terms of Sections 1 and 2 above provided that you accompany
|
||||
it with the complete corresponding machine-readable source code, which
|
||||
must be distributed under the terms of Sections 1 and 2 above on a
|
||||
medium customarily used for software interchange.
|
||||
|
||||
If distribution of object code is made by offering access to copy
|
||||
from a designated place, then offering equivalent access to copy the
|
||||
source code from the same place satisfies the requirement to
|
||||
distribute the source code, even though third parties are not
|
||||
compelled to copy the source along with the object code.
|
||||
|
||||
5. A program that contains no derivative of any portion of the
|
||||
Library, but is designed to work with the Library by being compiled or
|
||||
linked with it, is called a "work that uses the Library". Such a
|
||||
work, in isolation, is not a derivative work of the Library, and
|
||||
therefore falls outside the scope of this License.
|
||||
|
||||
However, linking a "work that uses the Library" with the Library
|
||||
creates an executable that is a derivative of the Library (because it
|
||||
contains portions of the Library), rather than a "work that uses the
|
||||
library". The executable is therefore covered by this License.
|
||||
Section 6 states terms for distribution of such executables.
|
||||
|
||||
When a "work that uses the Library" uses material from a header file
|
||||
that is part of the Library, the object code for the work may be a
|
||||
derivative work of the Library even though the source code is not.
|
||||
Whether this is true is especially significant if the work can be
|
||||
linked without the Library, or if the work is itself a library. The
|
||||
threshold for this to be true is not precisely defined by law.
|
||||
|
||||
If such an object file uses only numerical parameters, data
|
||||
structure layouts and accessors, and small macros and small inline
|
||||
functions (ten lines or less in length), then the use of the object
|
||||
file is unrestricted, regardless of whether it is legally a derivative
|
||||
work. (Executables containing this object code plus portions of the
|
||||
Library will still fall under Section 6.)
|
||||
|
||||
Otherwise, if the work is a derivative of the Library, you may
|
||||
distribute the object code for the work under the terms of Section 6.
|
||||
Any executables containing that work also fall under Section 6,
|
||||
whether or not they are linked directly with the Library itself.
|
||||
|
||||
6. As an exception to the Sections above, you may also combine or
|
||||
link a "work that uses the Library" with the Library to produce a
|
||||
work containing portions of the Library, and distribute that work
|
||||
under terms of your choice, provided that the terms permit
|
||||
modification of the work for the customer's own use and reverse
|
||||
engineering for debugging such modifications.
|
||||
|
||||
You must give prominent notice with each copy of the work that the
|
||||
Library is used in it and that the Library and its use are covered by
|
||||
this License. You must supply a copy of this License. If the work
|
||||
during execution displays copyright notices, you must include the
|
||||
copyright notice for the Library among them, as well as a reference
|
||||
directing the user to the copy of this License. Also, you must do one
|
||||
of these things:
|
||||
|
||||
a) Accompany the work with the complete corresponding
|
||||
machine-readable source code for the Library including whatever
|
||||
changes were used in the work (which must be distributed under
|
||||
Sections 1 and 2 above); and, if the work is an executable linked
|
||||
with the Library, with the complete machine-readable "work that
|
||||
uses the Library", as object code and/or source code, so that the
|
||||
user can modify the Library and then relink to produce a modified
|
||||
executable containing the modified Library. (It is understood
|
||||
that the user who changes the contents of definitions files in the
|
||||
Library will not necessarily be able to recompile the application
|
||||
to use the modified definitions.)
|
||||
|
||||
b) Use a suitable shared library mechanism for linking with the
|
||||
Library. A suitable mechanism is one that (1) uses at run time a
|
||||
copy of the library already present on the user's computer system,
|
||||
rather than copying library functions into the executable, and (2)
|
||||
will operate properly with a modified version of the library, if
|
||||
the user installs one, as long as the modified version is
|
||||
interface-compatible with the version that the work was made with.
|
||||
|
||||
c) Accompany the work with a written offer, valid for at
|
||||
least three years, to give the same user the materials
|
||||
specified in Subsection 6a, above, for a charge no more
|
||||
than the cost of performing this distribution.
|
||||
|
||||
d) If distribution of the work is made by offering access to copy
|
||||
from a designated place, offer equivalent access to copy the above
|
||||
specified materials from the same place.
|
||||
|
||||
e) Verify that the user has already received a copy of these
|
||||
materials or that you have already sent this user a copy.
|
||||
|
||||
For an executable, the required form of the "work that uses the
|
||||
Library" must include any data and utility programs needed for
|
||||
reproducing the executable from it. However, as a special exception,
|
||||
the materials to be distributed need not include anything that is
|
||||
normally distributed (in either source or binary form) with the major
|
||||
components (compiler, kernel, and so on) of the operating system on
|
||||
which the executable runs, unless that component itself accompanies
|
||||
the executable.
|
||||
|
||||
It may happen that this requirement contradicts the license
|
||||
restrictions of other proprietary libraries that do not normally
|
||||
accompany the operating system. Such a contradiction means you cannot
|
||||
use both them and the Library together in an executable that you
|
||||
distribute.
|
||||
|
||||
7. You may place library facilities that are a work based on the
|
||||
Library side-by-side in a single library together with other library
|
||||
facilities not covered by this License, and distribute such a combined
|
||||
library, provided that the separate distribution of the work based on
|
||||
the Library and of the other library facilities is otherwise
|
||||
permitted, and provided that you do these two things:
|
||||
|
||||
a) Accompany the combined library with a copy of the same work
|
||||
based on the Library, uncombined with any other library
|
||||
facilities. This must be distributed under the terms of the
|
||||
Sections above.
|
||||
|
||||
b) Give prominent notice with the combined library of the fact
|
||||
that part of it is a work based on the Library, and explaining
|
||||
where to find the accompanying uncombined form of the same work.
|
||||
|
||||
8. You may not copy, modify, sublicense, link with, or distribute
|
||||
the Library except as expressly provided under this License. Any
|
||||
attempt otherwise to copy, modify, sublicense, link with, or
|
||||
distribute the Library is void, and will automatically terminate your
|
||||
rights under this License. However, parties who have received copies,
|
||||
or rights, from you under this License will not have their licenses
|
||||
terminated so long as such parties remain in full compliance.
|
||||
|
||||
9. You are not required to accept this License, since you have not
|
||||
signed it. However, nothing else grants you permission to modify or
|
||||
distribute the Library or its derivative works. These actions are
|
||||
prohibited by law if you do not accept this License. Therefore, by
|
||||
modifying or distributing the Library (or any work based on the
|
||||
Library), you indicate your acceptance of this License to do so, and
|
||||
all its terms and conditions for copying, distributing or modifying
|
||||
the Library or works based on it.
|
||||
|
||||
10. Each time you redistribute the Library (or any work based on the
|
||||
Library), the recipient automatically receives a license from the
|
||||
original licensor to copy, distribute, link with or modify the Library
|
||||
subject to these terms and conditions. You may not impose any further
|
||||
restrictions on the recipients' exercise of the rights granted herein.
|
||||
You are not responsible for enforcing compliance by third parties with
|
||||
this License.
|
||||
|
||||
11. If, as a consequence of a court judgment or allegation of patent
|
||||
infringement or for any other reason (not limited to patent issues),
|
||||
conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot
|
||||
distribute so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you
|
||||
may not distribute the Library at all. For example, if a patent
|
||||
license would not permit royalty-free redistribution of the Library by
|
||||
all those who receive copies directly or indirectly through you, then
|
||||
the only way you could satisfy both it and this License would be to
|
||||
refrain entirely from distribution of the Library.
|
||||
|
||||
If any portion of this section is held invalid or unenforceable under any
|
||||
particular circumstance, the balance of the section is intended to apply,
|
||||
and the section as a whole is intended to apply in other circumstances.
|
||||
|
||||
It is not the purpose of this section to induce you to infringe any
|
||||
patents or other property right claims or to contest validity of any
|
||||
such claims; this section has the sole purpose of protecting the
|
||||
integrity of the free software distribution system which is
|
||||
implemented by public license practices. Many people have made
|
||||
generous contributions to the wide range of software distributed
|
||||
through that system in reliance on consistent application of that
|
||||
system; it is up to the author/donor to decide if he or she is willing
|
||||
to distribute software through any other system and a licensee cannot
|
||||
impose that choice.
|
||||
|
||||
This section is intended to make thoroughly clear what is believed to
|
||||
be a consequence of the rest of this License.
|
||||
|
||||
12. If the distribution and/or use of the Library is restricted in
|
||||
certain countries either by patents or by copyrighted interfaces, the
|
||||
original copyright holder who places the Library under this License may add
|
||||
an explicit geographical distribution limitation excluding those countries,
|
||||
so that distribution is permitted only in or among countries not thus
|
||||
excluded. In such case, this License incorporates the limitation as if
|
||||
written in the body of this License.
|
||||
|
||||
13. The Free Software Foundation may publish revised and/or new
|
||||
versions of the Lesser General Public License from time to time.
|
||||
Such new versions will be similar in spirit to the present version,
|
||||
but may differ in detail to address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the Library
|
||||
specifies a version number of this License which applies to it and
|
||||
"any later version", you have the option of following the terms and
|
||||
conditions either of that version or of any later version published by
|
||||
the Free Software Foundation. If the Library does not specify a
|
||||
license version number, you may choose any version ever published by
|
||||
the Free Software Foundation.
|
||||
|
||||
14. If you wish to incorporate parts of the Library into other free
|
||||
programs whose distribution conditions are incompatible with these,
|
||||
write to the author to ask for permission. For software which is
|
||||
copyrighted by the Free Software Foundation, write to the Free
|
||||
Software Foundation; we sometimes make exceptions for this. Our
|
||||
decision will be guided by the two goals of preserving the free status
|
||||
of all derivatives of our free software and of promoting the sharing
|
||||
and reuse of software generally.
|
||||
|
||||
NO WARRANTY
|
||||
|
||||
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
|
||||
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
|
||||
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
|
||||
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
|
||||
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
|
||||
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
|
||||
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
|
||||
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
|
||||
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
|
||||
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
|
||||
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
|
||||
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
|
||||
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
|
||||
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
|
||||
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||||
DAMAGES.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Libraries
|
||||
|
||||
If you develop a new library, and you want it to be of the greatest
|
||||
possible use to the public, we recommend making it free software that
|
||||
everyone can redistribute and change. You can do so by permitting
|
||||
redistribution under these terms (or, alternatively, under the terms of the
|
||||
ordinary General Public License).
|
||||
|
||||
To apply these terms, attach the following notices to the library. It is
|
||||
safest to attach them to the start of each source file to most effectively
|
||||
convey the exclusion of warranty; and each file should have at least the
|
||||
"copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the library's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or your
|
||||
school, if any, to sign a "copyright disclaimer" for the library, if
|
||||
necessary. Here is a sample; alter the names:
|
||||
|
||||
Yoyodyne, Inc., hereby disclaims all copyright interest in the
|
||||
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
|
||||
|
||||
<signature of Ty Coon>, 1 April 1990
|
||||
Ty Coon, President of Vice
|
||||
|
||||
That's all there is to it!
|
||||
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
include LICENSE
|
||||
include *.rst
|
||||
include requirements.txt
|
||||
include test.py
|
||||
recursive-include docs *
|
||||
recursive-include tests *
|
||||
global-exclude *.pyc
|
||||
global-exclude __pycache__
|
|
@ -1,140 +0,0 @@
|
|||
Class Hierarchy for chardet
|
||||
===========================
|
||||
|
||||
Universal Detector
|
||||
------------------
|
||||
Has a list of probers.
|
||||
|
||||
CharSetProber
|
||||
-------------
|
||||
Mostly abstract parent class.
|
||||
|
||||
CharSetGroupProber
|
||||
------------------
|
||||
Runs a bunch of related probers at the same time and decides which is best.
|
||||
|
||||
SBCSGroupProber
|
||||
---------------
|
||||
SBCS = Single-ByteCharSet. Runs a bunch of SingleByteCharSetProbers. Always
|
||||
contains the same SingleByteCharSetProbers.
|
||||
|
||||
SingleByteCharSetProber
|
||||
-----------------------
|
||||
A CharSetProber that is used for detecting single-byte encodings by using
|
||||
a "precedence matrix" (i.e., a character bigram model).
|
||||
|
||||
MBCSGroupProber
|
||||
---------------
|
||||
Runs a bunch of MultiByteCharSetProbers. It also uses a UTF8Prober, which is
|
||||
essentially a MultiByteCharSetProber that only has a state machine. Always
|
||||
contains the same MultiByteCharSetProbers.
|
||||
|
||||
MultiByteCharSetProber
|
||||
----------------------
|
||||
A CharSetProber that uses both a character unigram model (or "character
|
||||
distribution analysis") and an independent state machine for trying to
|
||||
detect and encoding.
|
||||
|
||||
CodingStateMachine
|
||||
------------------
|
||||
Used for "coding scheme" detection, where we just look for either invalid
|
||||
byte sequences or sequences that only occur for that particular encoding.
|
||||
|
||||
CharDistributionAnalysis
|
||||
------------------------
|
||||
Used for character unigram distribution encoding detection. Takes a mapping
|
||||
from characters to a "frequency order" (i.e., what frequency rank that byte has
|
||||
in the given encoding) and a "typical distribution ratio", which is the number
|
||||
of occurrences of the 512 most frequently used characters divided by the number
|
||||
of occurrences of the rest of the characters for a typical document.
|
||||
The "characters" in this case are 2-byte sequences and they are first converted
|
||||
to an "order" (name comes from ord() function, I believe). This "order" is used
|
||||
to index into the frequency order table to determine the frequency rank of that
|
||||
byte sequence. The reason this extra step is necessary is that the frequency
|
||||
rank table is language-specific (and not encoding-specific).
|
||||
|
||||
|
||||
What's where
|
||||
============
|
||||
|
||||
|
||||
Bigram files
|
||||
------------
|
||||
|
||||
- ``hebrewprober.py``
|
||||
- ``jpcntxprober.py``
|
||||
- ``langbulgarianmodel.py``
|
||||
- ``langcyrillicmodel.py``
|
||||
- ``langgreekmodel.py``
|
||||
- ``langhebrewmodel.py``
|
||||
- ``langhungarianmodel.py``
|
||||
- ``langthaimodel.py``
|
||||
- ``latin1prober.py``
|
||||
- ``sbcharsetprober.py``
|
||||
- ``sbcsgroupprober.py``
|
||||
|
||||
|
||||
Coding Scheme files
|
||||
-------------------
|
||||
|
||||
- ``escprober.py``
|
||||
- ``escsm.py``
|
||||
- ``utf8prober.py``
|
||||
- ``codingstatemachine.py``
|
||||
- ``mbcssmprober.py``
|
||||
|
||||
|
||||
Unigram files
|
||||
-------------
|
||||
|
||||
- ``big5freqprober.py``
|
||||
- ``chardistribution.py``
|
||||
- ``euckrfreqprober.py``
|
||||
- ``euctwfreqprober.py``
|
||||
- ``gb2312freqprober.py``
|
||||
- ``jisfreqprober.py``
|
||||
|
||||
Multibyte probers
|
||||
-----------------
|
||||
|
||||
- ``big5prober.py``
|
||||
- ``cp949prober.py``
|
||||
- ``eucjpprober.py``
|
||||
- ``euckrprober.py``
|
||||
- ``euctwprober.py``
|
||||
- ``gb2312prober.py``
|
||||
- ``mbcharsetprober.py``
|
||||
- ``mbcsgroupprober.py``
|
||||
- ``sjisprober.py``
|
||||
|
||||
Misc files
|
||||
----------
|
||||
|
||||
- ``__init__.py`` (currently has ``detect`` function in it)
|
||||
- ``compat.py``
|
||||
- ``enums.py``
|
||||
- ``universaldetector.py``
|
||||
- ``version.py``
|
||||
|
||||
|
||||
Useful links
|
||||
============
|
||||
|
||||
This is just a collection of information that I've found useful or thought
|
||||
might be useful in the future:
|
||||
|
||||
- `BOM by Encoding`_
|
||||
|
||||
- `A Composite Approach to Language/Encoding Detection`_
|
||||
|
||||
- `What Every Programmer Absolutely...`_
|
||||
|
||||
- The actual `source`_
|
||||
|
||||
|
||||
.. _BOM by Encoding:
|
||||
https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding
|
||||
.. _A Composite Approach to Language/Encoding Detection:
|
||||
http://www-archive.mozilla.org/projects/intl/UniversalCharsetDetection.html
|
||||
.. _What Every Programmer Absolutely...: http://kunststube.net/encoding/
|
||||
.. _source: https://dxr.mozilla.org/mozilla/source/intl/chardet/
|
|
@ -1,99 +0,0 @@
|
|||
Metadata-Version: 1.2
|
||||
Name: chardet
|
||||
Version: 4.0.0
|
||||
Summary: Universal encoding detector for Python 2 and 3
|
||||
Home-page: https://github.com/chardet/chardet
|
||||
Author: Mark Pilgrim
|
||||
Author-email: mark@diveintomark.org
|
||||
Maintainer: Daniel Blanchard
|
||||
Maintainer-email: dan.blanchard@gmail.com
|
||||
License: LGPL
|
||||
Description: Chardet: The Universal Character Encoding Detector
|
||||
--------------------------------------------------
|
||||
|
||||
.. image:: https://img.shields.io/travis/chardet/chardet/stable.svg
|
||||
:alt: Build status
|
||||
:target: https://travis-ci.org/chardet/chardet
|
||||
|
||||
.. image:: https://img.shields.io/coveralls/chardet/chardet/stable.svg
|
||||
:target: https://coveralls.io/r/chardet/chardet
|
||||
|
||||
.. image:: https://img.shields.io/pypi/v/chardet.svg
|
||||
:target: https://warehouse.python.org/project/chardet/
|
||||
:alt: Latest version on PyPI
|
||||
|
||||
.. image:: https://img.shields.io/pypi/l/chardet.svg
|
||||
:alt: License
|
||||
|
||||
|
||||
Detects
|
||||
- ASCII, UTF-8, UTF-16 (2 variants), UTF-32 (4 variants)
|
||||
- Big5, GB2312, EUC-TW, HZ-GB-2312, ISO-2022-CN (Traditional and Simplified Chinese)
|
||||
- EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP (Japanese)
|
||||
- EUC-KR, ISO-2022-KR (Korean)
|
||||
- KOI8-R, MacCyrillic, IBM855, IBM866, ISO-8859-5, windows-1251 (Cyrillic)
|
||||
- ISO-8859-5, windows-1251 (Bulgarian)
|
||||
- ISO-8859-1, windows-1252 (Western European languages)
|
||||
- ISO-8859-7, windows-1253 (Greek)
|
||||
- ISO-8859-8, windows-1255 (Visual and Logical Hebrew)
|
||||
- TIS-620 (Thai)
|
||||
|
||||
.. note::
|
||||
Our ISO-8859-2 and windows-1250 (Hungarian) probers have been temporarily
|
||||
disabled until we can retrain the models.
|
||||
|
||||
Requires Python 2.7 or 3.5+.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
Install from `PyPI <https://pypi.org/project/chardet/>`_::
|
||||
|
||||
pip install chardet
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
For users, docs are now available at https://chardet.readthedocs.io/.
|
||||
|
||||
Command-line Tool
|
||||
-----------------
|
||||
|
||||
chardet comes with a command-line script which reports on the encodings of one
|
||||
or more files::
|
||||
|
||||
% chardetect somefile someotherfile
|
||||
somefile: windows-1252 with confidence 0.5
|
||||
someotherfile: ascii with confidence 1.0
|
||||
|
||||
About
|
||||
-----
|
||||
|
||||
This is a continuation of Mark Pilgrim's excellent chardet. Previously, two
|
||||
versions needed to be maintained: one that supported python 2.x and one that
|
||||
supported python 3.x. We've recently merged with `Ian Cordasco <https://github.com/sigmavirus24>`_'s
|
||||
`charade <https://github.com/sigmavirus24/charade>`_ fork, so now we have one
|
||||
coherent version that works for Python 2.7+ and 3.4+.
|
||||
|
||||
:maintainer: Dan Blanchard
|
||||
|
||||
Keywords: encoding,i18n,xml
|
||||
Platform: UNKNOWN
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 2
|
||||
Classifier: Programming Language :: Python :: 2.7
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.5
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Topic :: Text Processing :: Linguistic
|
||||
Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*
|
|
@ -1,68 +0,0 @@
|
|||
Chardet: The Universal Character Encoding Detector
|
||||
--------------------------------------------------
|
||||
|
||||
.. image:: https://img.shields.io/travis/chardet/chardet/stable.svg
|
||||
:alt: Build status
|
||||
:target: https://travis-ci.org/chardet/chardet
|
||||
|
||||
.. image:: https://img.shields.io/coveralls/chardet/chardet/stable.svg
|
||||
:target: https://coveralls.io/r/chardet/chardet
|
||||
|
||||
.. image:: https://img.shields.io/pypi/v/chardet.svg
|
||||
:target: https://warehouse.python.org/project/chardet/
|
||||
:alt: Latest version on PyPI
|
||||
|
||||
.. image:: https://img.shields.io/pypi/l/chardet.svg
|
||||
:alt: License
|
||||
|
||||
|
||||
Detects
|
||||
- ASCII, UTF-8, UTF-16 (2 variants), UTF-32 (4 variants)
|
||||
- Big5, GB2312, EUC-TW, HZ-GB-2312, ISO-2022-CN (Traditional and Simplified Chinese)
|
||||
- EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP (Japanese)
|
||||
- EUC-KR, ISO-2022-KR (Korean)
|
||||
- KOI8-R, MacCyrillic, IBM855, IBM866, ISO-8859-5, windows-1251 (Cyrillic)
|
||||
- ISO-8859-5, windows-1251 (Bulgarian)
|
||||
- ISO-8859-1, windows-1252 (Western European languages)
|
||||
- ISO-8859-7, windows-1253 (Greek)
|
||||
- ISO-8859-8, windows-1255 (Visual and Logical Hebrew)
|
||||
- TIS-620 (Thai)
|
||||
|
||||
.. note::
|
||||
Our ISO-8859-2 and windows-1250 (Hungarian) probers have been temporarily
|
||||
disabled until we can retrain the models.
|
||||
|
||||
Requires Python 2.7 or 3.5+.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
Install from `PyPI <https://pypi.org/project/chardet/>`_::
|
||||
|
||||
pip install chardet
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
For users, docs are now available at https://chardet.readthedocs.io/.
|
||||
|
||||
Command-line Tool
|
||||
-----------------
|
||||
|
||||
chardet comes with a command-line script which reports on the encodings of one
|
||||
or more files::
|
||||
|
||||
% chardetect somefile someotherfile
|
||||
somefile: windows-1252 with confidence 0.5
|
||||
someotherfile: ascii with confidence 1.0
|
||||
|
||||
About
|
||||
-----
|
||||
|
||||
This is a continuation of Mark Pilgrim's excellent chardet. Previously, two
|
||||
versions needed to be maintained: one that supported python 2.x and one that
|
||||
supported python 3.x. We've recently merged with `Ian Cordasco <https://github.com/sigmavirus24>`_'s
|
||||
`charade <https://github.com/sigmavirus24/charade>`_ fork, so now we have one
|
||||
coherent version that works for Python 2.7+ and 3.4+.
|
||||
|
||||
:maintainer: Dan Blanchard
|
|
@ -1,83 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
|
||||
from .universaldetector import UniversalDetector
|
||||
from .enums import InputState
|
||||
from .version import __version__, VERSION
|
||||
|
||||
|
||||
__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']
|
||||
|
||||
|
||||
def detect(byte_str):
|
||||
"""
|
||||
Detect the encoding of the given byte string.
|
||||
|
||||
:param byte_str: The byte sequence to examine.
|
||||
:type byte_str: ``bytes`` or ``bytearray``
|
||||
"""
|
||||
if not isinstance(byte_str, bytearray):
|
||||
if not isinstance(byte_str, bytes):
|
||||
raise TypeError('Expected object of type bytes or bytearray, got: '
|
||||
'{}'.format(type(byte_str)))
|
||||
else:
|
||||
byte_str = bytearray(byte_str)
|
||||
detector = UniversalDetector()
|
||||
detector.feed(byte_str)
|
||||
return detector.close()
|
||||
|
||||
|
||||
def detect_all(byte_str):
|
||||
"""
|
||||
Detect all the possible encodings of the given byte string.
|
||||
|
||||
:param byte_str: The byte sequence to examine.
|
||||
:type byte_str: ``bytes`` or ``bytearray``
|
||||
"""
|
||||
if not isinstance(byte_str, bytearray):
|
||||
if not isinstance(byte_str, bytes):
|
||||
raise TypeError('Expected object of type bytes or bytearray, got: '
|
||||
'{}'.format(type(byte_str)))
|
||||
else:
|
||||
byte_str = bytearray(byte_str)
|
||||
|
||||
detector = UniversalDetector()
|
||||
detector.feed(byte_str)
|
||||
detector.close()
|
||||
|
||||
if detector._input_state == InputState.HIGH_BYTE:
|
||||
results = []
|
||||
for prober in detector._charset_probers:
|
||||
if prober.get_confidence() > detector.MINIMUM_THRESHOLD:
|
||||
charset_name = prober.charset_name
|
||||
lower_charset_name = prober.charset_name.lower()
|
||||
# Use Windows encoding name instead of ISO-8859 if we saw any
|
||||
# extra Windows-specific bytes
|
||||
if lower_charset_name.startswith('iso-8859'):
|
||||
if detector._has_win_bytes:
|
||||
charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
|
||||
charset_name)
|
||||
results.append({
|
||||
'encoding': charset_name,
|
||||
'confidence': prober.get_confidence(),
|
||||
'language': prober.language,
|
||||
})
|
||||
if len(results) > 0:
|
||||
return sorted(results, key=lambda result: -result['confidence'])
|
||||
|
||||
return [detector.result]
|
|
@ -1,386 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is Mozilla Communicator client code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
# Big5 frequency table
|
||||
# by Taiwan's Mandarin Promotion Council
|
||||
# <http://www.edu.tw:81/mandr/>
|
||||
#
|
||||
# 128 --> 0.42261
|
||||
# 256 --> 0.57851
|
||||
# 512 --> 0.74851
|
||||
# 1024 --> 0.89384
|
||||
# 2048 --> 0.97583
|
||||
#
|
||||
# Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98
|
||||
# Random Distribution Ration = 512/(5401-512)=0.105
|
||||
#
|
||||
# Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
|
||||
|
||||
BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
||||
|
||||
#Char to FreqOrder table
|
||||
BIG5_TABLE_SIZE = 5376
|
||||
|
||||
BIG5_CHAR_TO_FREQ_ORDER = (
|
||||
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
|
||||
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
|
||||
1198,3972,3350,4202, 410,2215, 302, 590, 361,1964, 8, 204, 58,4510,5009,1932, # 48
|
||||
63,5010,5011, 317,1614, 75, 222, 159,4203,2417,1480,5012,3555,3091, 224,2822, # 64
|
||||
3682, 3, 10,3973,1471, 29,2787,1135,2866,1940, 873, 130,3275,1123, 312,5013, # 80
|
||||
4511,2052, 507, 252, 682,5014, 142,1915, 124, 206,2947, 34,3556,3204, 64, 604, # 96
|
||||
5015,2501,1977,1978, 155,1991, 645, 641,1606,5016,3452, 337, 72, 406,5017, 80, # 112
|
||||
630, 238,3205,1509, 263, 939,1092,2654, 756,1440,1094,3453, 449, 69,2987, 591, # 128
|
||||
179,2096, 471, 115,2035,1844, 60, 50,2988, 134, 806,1869, 734,2036,3454, 180, # 144
|
||||
995,1607, 156, 537,2907, 688,5018, 319,1305, 779,2145, 514,2379, 298,4512, 359, # 160
|
||||
2502, 90,2716,1338, 663, 11, 906,1099,2553, 20,2441, 182, 532,1716,5019, 732, # 176
|
||||
1376,4204,1311,1420,3206, 25,2317,1056, 113, 399, 382,1950, 242,3455,2474, 529, # 192
|
||||
3276, 475,1447,3683,5020, 117, 21, 656, 810,1297,2300,2334,3557,5021, 126,4205, # 208
|
||||
706, 456, 150, 613,4513, 71,1118,2037,4206, 145,3092, 85, 835, 486,2115,1246, # 224
|
||||
1426, 428, 727,1285,1015, 800, 106, 623, 303,1281,5022,2128,2359, 347,3815, 221, # 240
|
||||
3558,3135,5023,1956,1153,4207, 83, 296,1199,3093, 192, 624, 93,5024, 822,1898, # 256
|
||||
2823,3136, 795,2065, 991,1554,1542,1592, 27, 43,2867, 859, 139,1456, 860,4514, # 272
|
||||
437, 712,3974, 164,2397,3137, 695, 211,3037,2097, 195,3975,1608,3559,3560,3684, # 288
|
||||
3976, 234, 811,2989,2098,3977,2233,1441,3561,1615,2380, 668,2077,1638, 305, 228, # 304
|
||||
1664,4515, 467, 415,5025, 262,2099,1593, 239, 108, 300, 200,1033, 512,1247,2078, # 320
|
||||
5026,5027,2176,3207,3685,2682, 593, 845,1062,3277, 88,1723,2038,3978,1951, 212, # 336
|
||||
266, 152, 149, 468,1899,4208,4516, 77, 187,5028,3038, 37, 5,2990,5029,3979, # 352
|
||||
5030,5031, 39,2524,4517,2908,3208,2079, 55, 148, 74,4518, 545, 483,1474,1029, # 368
|
||||
1665, 217,1870,1531,3138,1104,2655,4209, 24, 172,3562, 900,3980,3563,3564,4519, # 384
|
||||
32,1408,2824,1312, 329, 487,2360,2251,2717, 784,2683, 4,3039,3351,1427,1789, # 400
|
||||
188, 109, 499,5032,3686,1717,1790, 888,1217,3040,4520,5033,3565,5034,3352,1520, # 416
|
||||
3687,3981, 196,1034, 775,5035,5036, 929,1816, 249, 439, 38,5037,1063,5038, 794, # 432
|
||||
3982,1435,2301, 46, 178,3278,2066,5039,2381,5040, 214,1709,4521, 804, 35, 707, # 448
|
||||
324,3688,1601,2554, 140, 459,4210,5041,5042,1365, 839, 272, 978,2262,2580,3456, # 464
|
||||
2129,1363,3689,1423, 697, 100,3094, 48, 70,1231, 495,3139,2196,5043,1294,5044, # 480
|
||||
2080, 462, 586,1042,3279, 853, 256, 988, 185,2382,3457,1698, 434,1084,5045,3458, # 496
|
||||
314,2625,2788,4522,2335,2336, 569,2285, 637,1817,2525, 757,1162,1879,1616,3459, # 512
|
||||
287,1577,2116, 768,4523,1671,2868,3566,2526,1321,3816, 909,2418,5046,4211, 933, # 528
|
||||
3817,4212,2053,2361,1222,4524, 765,2419,1322, 786,4525,5047,1920,1462,1677,2909, # 544
|
||||
1699,5048,4526,1424,2442,3140,3690,2600,3353,1775,1941,3460,3983,4213, 309,1369, # 560
|
||||
1130,2825, 364,2234,1653,1299,3984,3567,3985,3986,2656, 525,1085,3041, 902,2001, # 576
|
||||
1475, 964,4527, 421,1845,1415,1057,2286, 940,1364,3141, 376,4528,4529,1381, 7, # 592
|
||||
2527, 983,2383, 336,1710,2684,1846, 321,3461, 559,1131,3042,2752,1809,1132,1313, # 608
|
||||
265,1481,1858,5049, 352,1203,2826,3280, 167,1089, 420,2827, 776, 792,1724,3568, # 624
|
||||
4214,2443,3281,5050,4215,5051, 446, 229, 333,2753, 901,3818,1200,1557,4530,2657, # 640
|
||||
1921, 395,2754,2685,3819,4216,1836, 125, 916,3209,2626,4531,5052,5053,3820,5054, # 656
|
||||
5055,5056,4532,3142,3691,1133,2555,1757,3462,1510,2318,1409,3569,5057,2146, 438, # 672
|
||||
2601,2910,2384,3354,1068, 958,3043, 461, 311,2869,2686,4217,1916,3210,4218,1979, # 688
|
||||
383, 750,2755,2627,4219, 274, 539, 385,1278,1442,5058,1154,1965, 384, 561, 210, # 704
|
||||
98,1295,2556,3570,5059,1711,2420,1482,3463,3987,2911,1257, 129,5060,3821, 642, # 720
|
||||
523,2789,2790,2658,5061, 141,2235,1333, 68, 176, 441, 876, 907,4220, 603,2602, # 736
|
||||
710, 171,3464, 404, 549, 18,3143,2398,1410,3692,1666,5062,3571,4533,2912,4534, # 752
|
||||
5063,2991, 368,5064, 146, 366, 99, 871,3693,1543, 748, 807,1586,1185, 22,2263, # 768
|
||||
379,3822,3211,5065,3212, 505,1942,2628,1992,1382,2319,5066, 380,2362, 218, 702, # 784
|
||||
1818,1248,3465,3044,3572,3355,3282,5067,2992,3694, 930,3283,3823,5068, 59,5069, # 800
|
||||
585, 601,4221, 497,3466,1112,1314,4535,1802,5070,1223,1472,2177,5071, 749,1837, # 816
|
||||
690,1900,3824,1773,3988,1476, 429,1043,1791,2236,2117, 917,4222, 447,1086,1629, # 832
|
||||
5072, 556,5073,5074,2021,1654, 844,1090, 105, 550, 966,1758,2828,1008,1783, 686, # 848
|
||||
1095,5075,2287, 793,1602,5076,3573,2603,4536,4223,2948,2302,4537,3825, 980,2503, # 864
|
||||
544, 353, 527,4538, 908,2687,2913,5077, 381,2629,1943,1348,5078,1341,1252, 560, # 880
|
||||
3095,5079,3467,2870,5080,2054, 973, 886,2081, 143,4539,5081,5082, 157,3989, 496, # 896
|
||||
4224, 57, 840, 540,2039,4540,4541,3468,2118,1445, 970,2264,1748,1966,2082,4225, # 912
|
||||
3144,1234,1776,3284,2829,3695, 773,1206,2130,1066,2040,1326,3990,1738,1725,4226, # 928
|
||||
279,3145, 51,1544,2604, 423,1578,2131,2067, 173,4542,1880,5083,5084,1583, 264, # 944
|
||||
610,3696,4543,2444, 280, 154,5085,5086,5087,1739, 338,1282,3096, 693,2871,1411, # 960
|
||||
1074,3826,2445,5088,4544,5089,5090,1240, 952,2399,5091,2914,1538,2688, 685,1483, # 976
|
||||
4227,2475,1436, 953,4228,2055,4545, 671,2400, 79,4229,2446,3285, 608, 567,2689, # 992
|
||||
3469,4230,4231,1691, 393,1261,1792,2401,5092,4546,5093,5094,5095,5096,1383,1672, # 1008
|
||||
3827,3213,1464, 522,1119, 661,1150, 216, 675,4547,3991,1432,3574, 609,4548,2690, # 1024
|
||||
2402,5097,5098,5099,4232,3045, 0,5100,2476, 315, 231,2447, 301,3356,4549,2385, # 1040
|
||||
5101, 233,4233,3697,1819,4550,4551,5102, 96,1777,1315,2083,5103, 257,5104,1810, # 1056
|
||||
3698,2718,1139,1820,4234,2022,1124,2164,2791,1778,2659,5105,3097, 363,1655,3214, # 1072
|
||||
5106,2993,5107,5108,5109,3992,1567,3993, 718, 103,3215, 849,1443, 341,3357,2949, # 1088
|
||||
1484,5110,1712, 127, 67, 339,4235,2403, 679,1412, 821,5111,5112, 834, 738, 351, # 1104
|
||||
2994,2147, 846, 235,1497,1881, 418,1993,3828,2719, 186,1100,2148,2756,3575,1545, # 1120
|
||||
1355,2950,2872,1377, 583,3994,4236,2581,2995,5113,1298,3699,1078,2557,3700,2363, # 1136
|
||||
78,3829,3830, 267,1289,2100,2002,1594,4237, 348, 369,1274,2197,2178,1838,4552, # 1152
|
||||
1821,2830,3701,2757,2288,2003,4553,2951,2758, 144,3358, 882,4554,3995,2759,3470, # 1168
|
||||
4555,2915,5114,4238,1726, 320,5115,3996,3046, 788,2996,5116,2831,1774,1327,2873, # 1184
|
||||
3997,2832,5117,1306,4556,2004,1700,3831,3576,2364,2660, 787,2023, 506, 824,3702, # 1200
|
||||
534, 323,4557,1044,3359,2024,1901, 946,3471,5118,1779,1500,1678,5119,1882,4558, # 1216
|
||||
165, 243,4559,3703,2528, 123, 683,4239, 764,4560, 36,3998,1793, 589,2916, 816, # 1232
|
||||
626,1667,3047,2237,1639,1555,1622,3832,3999,5120,4000,2874,1370,1228,1933, 891, # 1248
|
||||
2084,2917, 304,4240,5121, 292,2997,2720,3577, 691,2101,4241,1115,4561, 118, 662, # 1264
|
||||
5122, 611,1156, 854,2386,1316,2875, 2, 386, 515,2918,5123,5124,3286, 868,2238, # 1280
|
||||
1486, 855,2661, 785,2216,3048,5125,1040,3216,3578,5126,3146, 448,5127,1525,5128, # 1296
|
||||
2165,4562,5129,3833,5130,4242,2833,3579,3147, 503, 818,4001,3148,1568, 814, 676, # 1312
|
||||
1444, 306,1749,5131,3834,1416,1030, 197,1428, 805,2834,1501,4563,5132,5133,5134, # 1328
|
||||
1994,5135,4564,5136,5137,2198, 13,2792,3704,2998,3149,1229,1917,5138,3835,2132, # 1344
|
||||
5139,4243,4565,2404,3580,5140,2217,1511,1727,1120,5141,5142, 646,3836,2448, 307, # 1360
|
||||
5143,5144,1595,3217,5145,5146,5147,3705,1113,1356,4002,1465,2529,2530,5148, 519, # 1376
|
||||
5149, 128,2133, 92,2289,1980,5150,4003,1512, 342,3150,2199,5151,2793,2218,1981, # 1392
|
||||
3360,4244, 290,1656,1317, 789, 827,2365,5152,3837,4566, 562, 581,4004,5153, 401, # 1408
|
||||
4567,2252, 94,4568,5154,1399,2794,5155,1463,2025,4569,3218,1944,5156, 828,1105, # 1424
|
||||
4245,1262,1394,5157,4246, 605,4570,5158,1784,2876,5159,2835, 819,2102, 578,2200, # 1440
|
||||
2952,5160,1502, 436,3287,4247,3288,2836,4005,2919,3472,3473,5161,2721,2320,5162, # 1456
|
||||
5163,2337,2068, 23,4571, 193, 826,3838,2103, 699,1630,4248,3098, 390,1794,1064, # 1472
|
||||
3581,5164,1579,3099,3100,1400,5165,4249,1839,1640,2877,5166,4572,4573, 137,4250, # 1488
|
||||
598,3101,1967, 780, 104, 974,2953,5167, 278, 899, 253, 402, 572, 504, 493,1339, # 1504
|
||||
5168,4006,1275,4574,2582,2558,5169,3706,3049,3102,2253, 565,1334,2722, 863, 41, # 1520
|
||||
5170,5171,4575,5172,1657,2338, 19, 463,2760,4251, 606,5173,2999,3289,1087,2085, # 1536
|
||||
1323,2662,3000,5174,1631,1623,1750,4252,2691,5175,2878, 791,2723,2663,2339, 232, # 1552
|
||||
2421,5176,3001,1498,5177,2664,2630, 755,1366,3707,3290,3151,2026,1609, 119,1918, # 1568
|
||||
3474, 862,1026,4253,5178,4007,3839,4576,4008,4577,2265,1952,2477,5179,1125, 817, # 1584
|
||||
4254,4255,4009,1513,1766,2041,1487,4256,3050,3291,2837,3840,3152,5180,5181,1507, # 1600
|
||||
5182,2692, 733, 40,1632,1106,2879, 345,4257, 841,2531, 230,4578,3002,1847,3292, # 1616
|
||||
3475,5183,1263, 986,3476,5184, 735, 879, 254,1137, 857, 622,1300,1180,1388,1562, # 1632
|
||||
4010,4011,2954, 967,2761,2665,1349, 592,2134,1692,3361,3003,1995,4258,1679,4012, # 1648
|
||||
1902,2188,5185, 739,3708,2724,1296,1290,5186,4259,2201,2202,1922,1563,2605,2559, # 1664
|
||||
1871,2762,3004,5187, 435,5188, 343,1108, 596, 17,1751,4579,2239,3477,3709,5189, # 1680
|
||||
4580, 294,3582,2955,1693, 477, 979, 281,2042,3583, 643,2043,3710,2631,2795,2266, # 1696
|
||||
1031,2340,2135,2303,3584,4581, 367,1249,2560,5190,3585,5191,4582,1283,3362,2005, # 1712
|
||||
240,1762,3363,4583,4584, 836,1069,3153, 474,5192,2149,2532, 268,3586,5193,3219, # 1728
|
||||
1521,1284,5194,1658,1546,4260,5195,3587,3588,5196,4261,3364,2693,1685,4262, 961, # 1744
|
||||
1673,2632, 190,2006,2203,3841,4585,4586,5197, 570,2504,3711,1490,5198,4587,2633, # 1760
|
||||
3293,1957,4588, 584,1514, 396,1045,1945,5199,4589,1968,2449,5200,5201,4590,4013, # 1776
|
||||
619,5202,3154,3294, 215,2007,2796,2561,3220,4591,3221,4592, 763,4263,3842,4593, # 1792
|
||||
5203,5204,1958,1767,2956,3365,3712,1174, 452,1477,4594,3366,3155,5205,2838,1253, # 1808
|
||||
2387,2189,1091,2290,4264, 492,5206, 638,1169,1825,2136,1752,4014, 648, 926,1021, # 1824
|
||||
1324,4595, 520,4596, 997, 847,1007, 892,4597,3843,2267,1872,3713,2405,1785,4598, # 1840
|
||||
1953,2957,3103,3222,1728,4265,2044,3714,4599,2008,1701,3156,1551, 30,2268,4266, # 1856
|
||||
5207,2027,4600,3589,5208, 501,5209,4267, 594,3478,2166,1822,3590,3479,3591,3223, # 1872
|
||||
829,2839,4268,5210,1680,3157,1225,4269,5211,3295,4601,4270,3158,2341,5212,4602, # 1888
|
||||
4271,5213,4015,4016,5214,1848,2388,2606,3367,5215,4603, 374,4017, 652,4272,4273, # 1904
|
||||
375,1140, 798,5216,5217,5218,2366,4604,2269, 546,1659, 138,3051,2450,4605,5219, # 1920
|
||||
2254, 612,1849, 910, 796,3844,1740,1371, 825,3845,3846,5220,2920,2562,5221, 692, # 1936
|
||||
444,3052,2634, 801,4606,4274,5222,1491, 244,1053,3053,4275,4276, 340,5223,4018, # 1952
|
||||
1041,3005, 293,1168, 87,1357,5224,1539, 959,5225,2240, 721, 694,4277,3847, 219, # 1968
|
||||
1478, 644,1417,3368,2666,1413,1401,1335,1389,4019,5226,5227,3006,2367,3159,1826, # 1984
|
||||
730,1515, 184,2840, 66,4607,5228,1660,2958, 246,3369, 378,1457, 226,3480, 975, # 2000
|
||||
4020,2959,1264,3592, 674, 696,5229, 163,5230,1141,2422,2167, 713,3593,3370,4608, # 2016
|
||||
4021,5231,5232,1186, 15,5233,1079,1070,5234,1522,3224,3594, 276,1050,2725, 758, # 2032
|
||||
1126, 653,2960,3296,5235,2342, 889,3595,4022,3104,3007, 903,1250,4609,4023,3481, # 2048
|
||||
3596,1342,1681,1718, 766,3297, 286, 89,2961,3715,5236,1713,5237,2607,3371,3008, # 2064
|
||||
5238,2962,2219,3225,2880,5239,4610,2505,2533, 181, 387,1075,4024, 731,2190,3372, # 2080
|
||||
5240,3298, 310, 313,3482,2304, 770,4278, 54,3054, 189,4611,3105,3848,4025,5241, # 2096
|
||||
1230,1617,1850, 355,3597,4279,4612,3373, 111,4280,3716,1350,3160,3483,3055,4281, # 2112
|
||||
2150,3299,3598,5242,2797,4026,4027,3009, 722,2009,5243,1071, 247,1207,2343,2478, # 2128
|
||||
1378,4613,2010, 864,1437,1214,4614, 373,3849,1142,2220, 667,4615, 442,2763,2563, # 2144
|
||||
3850,4028,1969,4282,3300,1840, 837, 170,1107, 934,1336,1883,5244,5245,2119,4283, # 2160
|
||||
2841, 743,1569,5246,4616,4284, 582,2389,1418,3484,5247,1803,5248, 357,1395,1729, # 2176
|
||||
3717,3301,2423,1564,2241,5249,3106,3851,1633,4617,1114,2086,4285,1532,5250, 482, # 2192
|
||||
2451,4618,5251,5252,1492, 833,1466,5253,2726,3599,1641,2842,5254,1526,1272,3718, # 2208
|
||||
4286,1686,1795, 416,2564,1903,1954,1804,5255,3852,2798,3853,1159,2321,5256,2881, # 2224
|
||||
4619,1610,1584,3056,2424,2764, 443,3302,1163,3161,5257,5258,4029,5259,4287,2506, # 2240
|
||||
3057,4620,4030,3162,2104,1647,3600,2011,1873,4288,5260,4289, 431,3485,5261, 250, # 2256
|
||||
97, 81,4290,5262,1648,1851,1558, 160, 848,5263, 866, 740,1694,5264,2204,2843, # 2272
|
||||
3226,4291,4621,3719,1687, 950,2479, 426, 469,3227,3720,3721,4031,5265,5266,1188, # 2288
|
||||
424,1996, 861,3601,4292,3854,2205,2694, 168,1235,3602,4293,5267,2087,1674,4622, # 2304
|
||||
3374,3303, 220,2565,1009,5268,3855, 670,3010, 332,1208, 717,5269,5270,3603,2452, # 2320
|
||||
4032,3375,5271, 513,5272,1209,2882,3376,3163,4623,1080,5273,5274,5275,5276,2534, # 2336
|
||||
3722,3604, 815,1587,4033,4034,5277,3605,3486,3856,1254,4624,1328,3058,1390,4035, # 2352
|
||||
1741,4036,3857,4037,5278, 236,3858,2453,3304,5279,5280,3723,3859,1273,3860,4625, # 2368
|
||||
5281, 308,5282,4626, 245,4627,1852,2480,1307,2583, 430, 715,2137,2454,5283, 270, # 2384
|
||||
199,2883,4038,5284,3606,2727,1753, 761,1754, 725,1661,1841,4628,3487,3724,5285, # 2400
|
||||
5286, 587, 14,3305, 227,2608, 326, 480,2270, 943,2765,3607, 291, 650,1884,5287, # 2416
|
||||
1702,1226, 102,1547, 62,3488, 904,4629,3489,1164,4294,5288,5289,1224,1548,2766, # 2432
|
||||
391, 498,1493,5290,1386,1419,5291,2056,1177,4630, 813, 880,1081,2368, 566,1145, # 2448
|
||||
4631,2291,1001,1035,2566,2609,2242, 394,1286,5292,5293,2069,5294, 86,1494,1730, # 2464
|
||||
4039, 491,1588, 745, 897,2963, 843,3377,4040,2767,2884,3306,1768, 998,2221,2070, # 2480
|
||||
397,1827,1195,1970,3725,3011,3378, 284,5295,3861,2507,2138,2120,1904,5296,4041, # 2496
|
||||
2151,4042,4295,1036,3490,1905, 114,2567,4296, 209,1527,5297,5298,2964,2844,2635, # 2512
|
||||
2390,2728,3164, 812,2568,5299,3307,5300,1559, 737,1885,3726,1210, 885, 28,2695, # 2528
|
||||
3608,3862,5301,4297,1004,1780,4632,5302, 346,1982,2222,2696,4633,3863,1742, 797, # 2544
|
||||
1642,4043,1934,1072,1384,2152, 896,4044,3308,3727,3228,2885,3609,5303,2569,1959, # 2560
|
||||
4634,2455,1786,5304,5305,5306,4045,4298,1005,1308,3728,4299,2729,4635,4636,1528, # 2576
|
||||
2610, 161,1178,4300,1983, 987,4637,1101,4301, 631,4046,1157,3229,2425,1343,1241, # 2592
|
||||
1016,2243,2570, 372, 877,2344,2508,1160, 555,1935, 911,4047,5307, 466,1170, 169, # 2608
|
||||
1051,2921,2697,3729,2481,3012,1182,2012,2571,1251,2636,5308, 992,2345,3491,1540, # 2624
|
||||
2730,1201,2071,2406,1997,2482,5309,4638, 528,1923,2191,1503,1874,1570,2369,3379, # 2640
|
||||
3309,5310, 557,1073,5311,1828,3492,2088,2271,3165,3059,3107, 767,3108,2799,4639, # 2656
|
||||
1006,4302,4640,2346,1267,2179,3730,3230, 778,4048,3231,2731,1597,2667,5312,4641, # 2672
|
||||
5313,3493,5314,5315,5316,3310,2698,1433,3311, 131, 95,1504,4049, 723,4303,3166, # 2688
|
||||
1842,3610,2768,2192,4050,2028,2105,3731,5317,3013,4051,1218,5318,3380,3232,4052, # 2704
|
||||
4304,2584, 248,1634,3864, 912,5319,2845,3732,3060,3865, 654, 53,5320,3014,5321, # 2720
|
||||
1688,4642, 777,3494,1032,4053,1425,5322, 191, 820,2121,2846, 971,4643, 931,3233, # 2736
|
||||
135, 664, 783,3866,1998, 772,2922,1936,4054,3867,4644,2923,3234, 282,2732, 640, # 2752
|
||||
1372,3495,1127, 922, 325,3381,5323,5324, 711,2045,5325,5326,4055,2223,2800,1937, # 2768
|
||||
4056,3382,2224,2255,3868,2305,5327,4645,3869,1258,3312,4057,3235,2139,2965,4058, # 2784
|
||||
4059,5328,2225, 258,3236,4646, 101,1227,5329,3313,1755,5330,1391,3314,5331,2924, # 2800
|
||||
2057, 893,5332,5333,5334,1402,4305,2347,5335,5336,3237,3611,5337,5338, 878,1325, # 2816
|
||||
1781,2801,4647, 259,1385,2585, 744,1183,2272,4648,5339,4060,2509,5340, 684,1024, # 2832
|
||||
4306,5341, 472,3612,3496,1165,3315,4061,4062, 322,2153, 881, 455,1695,1152,1340, # 2848
|
||||
660, 554,2154,4649,1058,4650,4307, 830,1065,3383,4063,4651,1924,5342,1703,1919, # 2864
|
||||
5343, 932,2273, 122,5344,4652, 947, 677,5345,3870,2637, 297,1906,1925,2274,4653, # 2880
|
||||
2322,3316,5346,5347,4308,5348,4309, 84,4310, 112, 989,5349, 547,1059,4064, 701, # 2896
|
||||
3613,1019,5350,4311,5351,3497, 942, 639, 457,2306,2456, 993,2966, 407, 851, 494, # 2912
|
||||
4654,3384, 927,5352,1237,5353,2426,3385, 573,4312, 680, 921,2925,1279,1875, 285, # 2928
|
||||
790,1448,1984, 719,2168,5354,5355,4655,4065,4066,1649,5356,1541, 563,5357,1077, # 2944
|
||||
5358,3386,3061,3498, 511,3015,4067,4068,3733,4069,1268,2572,3387,3238,4656,4657, # 2960
|
||||
5359, 535,1048,1276,1189,2926,2029,3167,1438,1373,2847,2967,1134,2013,5360,4313, # 2976
|
||||
1238,2586,3109,1259,5361, 700,5362,2968,3168,3734,4314,5363,4315,1146,1876,1907, # 2992
|
||||
4658,2611,4070, 781,2427, 132,1589, 203, 147, 273,2802,2407, 898,1787,2155,4071, # 3008
|
||||
4072,5364,3871,2803,5365,5366,4659,4660,5367,3239,5368,1635,3872, 965,5369,1805, # 3024
|
||||
2699,1516,3614,1121,1082,1329,3317,4073,1449,3873, 65,1128,2848,2927,2769,1590, # 3040
|
||||
3874,5370,5371, 12,2668, 45, 976,2587,3169,4661, 517,2535,1013,1037,3240,5372, # 3056
|
||||
3875,2849,5373,3876,5374,3499,5375,2612, 614,1999,2323,3877,3110,2733,2638,5376, # 3072
|
||||
2588,4316, 599,1269,5377,1811,3735,5378,2700,3111, 759,1060, 489,1806,3388,3318, # 3088
|
||||
1358,5379,5380,2391,1387,1215,2639,2256, 490,5381,5382,4317,1759,2392,2348,5383, # 3104
|
||||
4662,3878,1908,4074,2640,1807,3241,4663,3500,3319,2770,2349, 874,5384,5385,3501, # 3120
|
||||
3736,1859, 91,2928,3737,3062,3879,4664,5386,3170,4075,2669,5387,3502,1202,1403, # 3136
|
||||
3880,2969,2536,1517,2510,4665,3503,2511,5388,4666,5389,2701,1886,1495,1731,4076, # 3152
|
||||
2370,4667,5390,2030,5391,5392,4077,2702,1216, 237,2589,4318,2324,4078,3881,4668, # 3168
|
||||
4669,2703,3615,3504, 445,4670,5393,5394,5395,5396,2771, 61,4079,3738,1823,4080, # 3184
|
||||
5397, 687,2046, 935, 925, 405,2670, 703,1096,1860,2734,4671,4081,1877,1367,2704, # 3200
|
||||
3389, 918,2106,1782,2483, 334,3320,1611,1093,4672, 564,3171,3505,3739,3390, 945, # 3216
|
||||
2641,2058,4673,5398,1926, 872,4319,5399,3506,2705,3112, 349,4320,3740,4082,4674, # 3232
|
||||
3882,4321,3741,2156,4083,4675,4676,4322,4677,2408,2047, 782,4084, 400, 251,4323, # 3248
|
||||
1624,5400,5401, 277,3742, 299,1265, 476,1191,3883,2122,4324,4325,1109, 205,5402, # 3264
|
||||
2590,1000,2157,3616,1861,5403,5404,5405,4678,5406,4679,2573, 107,2484,2158,4085, # 3280
|
||||
3507,3172,5407,1533, 541,1301, 158, 753,4326,2886,3617,5408,1696, 370,1088,4327, # 3296
|
||||
4680,3618, 579, 327, 440, 162,2244, 269,1938,1374,3508, 968,3063, 56,1396,3113, # 3312
|
||||
2107,3321,3391,5409,1927,2159,4681,3016,5410,3619,5411,5412,3743,4682,2485,5413, # 3328
|
||||
2804,5414,1650,4683,5415,2613,5416,5417,4086,2671,3392,1149,3393,4087,3884,4088, # 3344
|
||||
5418,1076, 49,5419, 951,3242,3322,3323, 450,2850, 920,5420,1812,2805,2371,4328, # 3360
|
||||
1909,1138,2372,3885,3509,5421,3243,4684,1910,1147,1518,2428,4685,3886,5422,4686, # 3376
|
||||
2393,2614, 260,1796,3244,5423,5424,3887,3324, 708,5425,3620,1704,5426,3621,1351, # 3392
|
||||
1618,3394,3017,1887, 944,4329,3395,4330,3064,3396,4331,5427,3744, 422, 413,1714, # 3408
|
||||
3325, 500,2059,2350,4332,2486,5428,1344,1911, 954,5429,1668,5430,5431,4089,2409, # 3424
|
||||
4333,3622,3888,4334,5432,2307,1318,2512,3114, 133,3115,2887,4687, 629, 31,2851, # 3440
|
||||
2706,3889,4688, 850, 949,4689,4090,2970,1732,2089,4335,1496,1853,5433,4091, 620, # 3456
|
||||
3245, 981,1242,3745,3397,1619,3746,1643,3326,2140,2457,1971,1719,3510,2169,5434, # 3472
|
||||
3246,5435,5436,3398,1829,5437,1277,4690,1565,2048,5438,1636,3623,3116,5439, 869, # 3488
|
||||
2852, 655,3890,3891,3117,4092,3018,3892,1310,3624,4691,5440,5441,5442,1733, 558, # 3504
|
||||
4692,3747, 335,1549,3065,1756,4336,3748,1946,3511,1830,1291,1192, 470,2735,2108, # 3520
|
||||
2806, 913,1054,4093,5443,1027,5444,3066,4094,4693, 982,2672,3399,3173,3512,3247, # 3536
|
||||
3248,1947,2807,5445, 571,4694,5446,1831,5447,3625,2591,1523,2429,5448,2090, 984, # 3552
|
||||
4695,3749,1960,5449,3750, 852, 923,2808,3513,3751, 969,1519, 999,2049,2325,1705, # 3568
|
||||
5450,3118, 615,1662, 151, 597,4095,2410,2326,1049, 275,4696,3752,4337, 568,3753, # 3584
|
||||
3626,2487,4338,3754,5451,2430,2275, 409,3249,5452,1566,2888,3514,1002, 769,2853, # 3600
|
||||
194,2091,3174,3755,2226,3327,4339, 628,1505,5453,5454,1763,2180,3019,4096, 521, # 3616
|
||||
1161,2592,1788,2206,2411,4697,4097,1625,4340,4341, 412, 42,3119, 464,5455,2642, # 3632
|
||||
4698,3400,1760,1571,2889,3515,2537,1219,2207,3893,2643,2141,2373,4699,4700,3328, # 3648
|
||||
1651,3401,3627,5456,5457,3628,2488,3516,5458,3756,5459,5460,2276,2092, 460,5461, # 3664
|
||||
4701,5462,3020, 962, 588,3629, 289,3250,2644,1116, 52,5463,3067,1797,5464,5465, # 3680
|
||||
5466,1467,5467,1598,1143,3757,4342,1985,1734,1067,4702,1280,3402, 465,4703,1572, # 3696
|
||||
510,5468,1928,2245,1813,1644,3630,5469,4704,3758,5470,5471,2673,1573,1534,5472, # 3712
|
||||
5473, 536,1808,1761,3517,3894,3175,2645,5474,5475,5476,4705,3518,2929,1912,2809, # 3728
|
||||
5477,3329,1122, 377,3251,5478, 360,5479,5480,4343,1529, 551,5481,2060,3759,1769, # 3744
|
||||
2431,5482,2930,4344,3330,3120,2327,2109,2031,4706,1404, 136,1468,1479, 672,1171, # 3760
|
||||
3252,2308, 271,3176,5483,2772,5484,2050, 678,2736, 865,1948,4707,5485,2014,4098, # 3776
|
||||
2971,5486,2737,2227,1397,3068,3760,4708,4709,1735,2931,3403,3631,5487,3895, 509, # 3792
|
||||
2854,2458,2890,3896,5488,5489,3177,3178,4710,4345,2538,4711,2309,1166,1010, 552, # 3808
|
||||
681,1888,5490,5491,2972,2973,4099,1287,1596,1862,3179, 358, 453, 736, 175, 478, # 3824
|
||||
1117, 905,1167,1097,5492,1854,1530,5493,1706,5494,2181,3519,2292,3761,3520,3632, # 3840
|
||||
4346,2093,4347,5495,3404,1193,2489,4348,1458,2193,2208,1863,1889,1421,3331,2932, # 3856
|
||||
3069,2182,3521, 595,2123,5496,4100,5497,5498,4349,1707,2646, 223,3762,1359, 751, # 3872
|
||||
3121, 183,3522,5499,2810,3021, 419,2374, 633, 704,3897,2394, 241,5500,5501,5502, # 3888
|
||||
838,3022,3763,2277,2773,2459,3898,1939,2051,4101,1309,3122,2246,1181,5503,1136, # 3904
|
||||
2209,3899,2375,1446,4350,2310,4712,5504,5505,4351,1055,2615, 484,3764,5506,4102, # 3920
|
||||
625,4352,2278,3405,1499,4353,4103,5507,4104,4354,3253,2279,2280,3523,5508,5509, # 3936
|
||||
2774, 808,2616,3765,3406,4105,4355,3123,2539, 526,3407,3900,4356, 955,5510,1620, # 3952
|
||||
4357,2647,2432,5511,1429,3766,1669,1832, 994, 928,5512,3633,1260,5513,5514,5515, # 3968
|
||||
1949,2293, 741,2933,1626,4358,2738,2460, 867,1184, 362,3408,1392,5516,5517,4106, # 3984
|
||||
4359,1770,1736,3254,2934,4713,4714,1929,2707,1459,1158,5518,3070,3409,2891,1292, # 4000
|
||||
1930,2513,2855,3767,1986,1187,2072,2015,2617,4360,5519,2574,2514,2170,3768,2490, # 4016
|
||||
3332,5520,3769,4715,5521,5522, 666,1003,3023,1022,3634,4361,5523,4716,1814,2257, # 4032
|
||||
574,3901,1603, 295,1535, 705,3902,4362, 283, 858, 417,5524,5525,3255,4717,4718, # 4048
|
||||
3071,1220,1890,1046,2281,2461,4107,1393,1599, 689,2575, 388,4363,5526,2491, 802, # 4064
|
||||
5527,2811,3903,2061,1405,2258,5528,4719,3904,2110,1052,1345,3256,1585,5529, 809, # 4080
|
||||
5530,5531,5532, 575,2739,3524, 956,1552,1469,1144,2328,5533,2329,1560,2462,3635, # 4096
|
||||
3257,4108, 616,2210,4364,3180,2183,2294,5534,1833,5535,3525,4720,5536,1319,3770, # 4112
|
||||
3771,1211,3636,1023,3258,1293,2812,5537,5538,5539,3905, 607,2311,3906, 762,2892, # 4128
|
||||
1439,4365,1360,4721,1485,3072,5540,4722,1038,4366,1450,2062,2648,4367,1379,4723, # 4144
|
||||
2593,5541,5542,4368,1352,1414,2330,2935,1172,5543,5544,3907,3908,4724,1798,1451, # 4160
|
||||
5545,5546,5547,5548,2936,4109,4110,2492,2351, 411,4111,4112,3637,3333,3124,4725, # 4176
|
||||
1561,2674,1452,4113,1375,5549,5550, 47,2974, 316,5551,1406,1591,2937,3181,5552, # 4192
|
||||
1025,2142,3125,3182, 354,2740, 884,2228,4369,2412, 508,3772, 726,3638, 996,2433, # 4208
|
||||
3639, 729,5553, 392,2194,1453,4114,4726,3773,5554,5555,2463,3640,2618,1675,2813, # 4224
|
||||
919,2352,2975,2353,1270,4727,4115, 73,5556,5557, 647,5558,3259,2856,2259,1550, # 4240
|
||||
1346,3024,5559,1332, 883,3526,5560,5561,5562,5563,3334,2775,5564,1212, 831,1347, # 4256
|
||||
4370,4728,2331,3909,1864,3073, 720,3910,4729,4730,3911,5565,4371,5566,5567,4731, # 4272
|
||||
5568,5569,1799,4732,3774,2619,4733,3641,1645,2376,4734,5570,2938, 669,2211,2675, # 4288
|
||||
2434,5571,2893,5572,5573,1028,3260,5574,4372,2413,5575,2260,1353,5576,5577,4735, # 4304
|
||||
3183, 518,5578,4116,5579,4373,1961,5580,2143,4374,5581,5582,3025,2354,2355,3912, # 4320
|
||||
516,1834,1454,4117,2708,4375,4736,2229,2620,1972,1129,3642,5583,2776,5584,2976, # 4336
|
||||
1422, 577,1470,3026,1524,3410,5585,5586, 432,4376,3074,3527,5587,2594,1455,2515, # 4352
|
||||
2230,1973,1175,5588,1020,2741,4118,3528,4737,5589,2742,5590,1743,1361,3075,3529, # 4368
|
||||
2649,4119,4377,4738,2295, 895, 924,4378,2171, 331,2247,3076, 166,1627,3077,1098, # 4384
|
||||
5591,1232,2894,2231,3411,4739, 657, 403,1196,2377, 542,3775,3412,1600,4379,3530, # 4400
|
||||
5592,4740,2777,3261, 576, 530,1362,4741,4742,2540,2676,3776,4120,5593, 842,3913, # 4416
|
||||
5594,2814,2032,1014,4121, 213,2709,3413, 665, 621,4380,5595,3777,2939,2435,5596, # 4432
|
||||
2436,3335,3643,3414,4743,4381,2541,4382,4744,3644,1682,4383,3531,1380,5597, 724, # 4448
|
||||
2282, 600,1670,5598,1337,1233,4745,3126,2248,5599,1621,4746,5600, 651,4384,5601, # 4464
|
||||
1612,4385,2621,5602,2857,5603,2743,2312,3078,5604, 716,2464,3079, 174,1255,2710, # 4480
|
||||
4122,3645, 548,1320,1398, 728,4123,1574,5605,1891,1197,3080,4124,5606,3081,3082, # 4496
|
||||
3778,3646,3779, 747,5607, 635,4386,4747,5608,5609,5610,4387,5611,5612,4748,5613, # 4512
|
||||
3415,4749,2437, 451,5614,3780,2542,2073,4388,2744,4389,4125,5615,1764,4750,5616, # 4528
|
||||
4390, 350,4751,2283,2395,2493,5617,4391,4126,2249,1434,4127, 488,4752, 458,4392, # 4544
|
||||
4128,3781, 771,1330,2396,3914,2576,3184,2160,2414,1553,2677,3185,4393,5618,2494, # 4560
|
||||
2895,2622,1720,2711,4394,3416,4753,5619,2543,4395,5620,3262,4396,2778,5621,2016, # 4576
|
||||
2745,5622,1155,1017,3782,3915,5623,3336,2313, 201,1865,4397,1430,5624,4129,5625, # 4592
|
||||
5626,5627,5628,5629,4398,1604,5630, 414,1866, 371,2595,4754,4755,3532,2017,3127, # 4608
|
||||
4756,1708, 960,4399, 887, 389,2172,1536,1663,1721,5631,2232,4130,2356,2940,1580, # 4624
|
||||
5632,5633,1744,4757,2544,4758,4759,5634,4760,5635,2074,5636,4761,3647,3417,2896, # 4640
|
||||
4400,5637,4401,2650,3418,2815, 673,2712,2465, 709,3533,4131,3648,4402,5638,1148, # 4656
|
||||
502, 634,5639,5640,1204,4762,3649,1575,4763,2623,3783,5641,3784,3128, 948,3263, # 4672
|
||||
121,1745,3916,1110,5642,4403,3083,2516,3027,4132,3785,1151,1771,3917,1488,4133, # 4688
|
||||
1987,5643,2438,3534,5644,5645,2094,5646,4404,3918,1213,1407,2816, 531,2746,2545, # 4704
|
||||
3264,1011,1537,4764,2779,4405,3129,1061,5647,3786,3787,1867,2897,5648,2018, 120, # 4720
|
||||
4406,4407,2063,3650,3265,2314,3919,2678,3419,1955,4765,4134,5649,3535,1047,2713, # 4736
|
||||
1266,5650,1368,4766,2858, 649,3420,3920,2546,2747,1102,2859,2679,5651,5652,2000, # 4752
|
||||
5653,1111,3651,2977,5654,2495,3921,3652,2817,1855,3421,3788,5655,5656,3422,2415, # 4768
|
||||
2898,3337,3266,3653,5657,2577,5658,3654,2818,4135,1460, 856,5659,3655,5660,2899, # 4784
|
||||
2978,5661,2900,3922,5662,4408, 632,2517, 875,3923,1697,3924,2296,5663,5664,4767, # 4800
|
||||
3028,1239, 580,4768,4409,5665, 914, 936,2075,1190,4136,1039,2124,5666,5667,5668, # 4816
|
||||
5669,3423,1473,5670,1354,4410,3925,4769,2173,3084,4137, 915,3338,4411,4412,3339, # 4832
|
||||
1605,1835,5671,2748, 398,3656,4413,3926,4138, 328,1913,2860,4139,3927,1331,4414, # 4848
|
||||
3029, 937,4415,5672,3657,4140,4141,3424,2161,4770,3425, 524, 742, 538,3085,1012, # 4864
|
||||
5673,5674,3928,2466,5675, 658,1103, 225,3929,5676,5677,4771,5678,4772,5679,3267, # 4880
|
||||
1243,5680,4142, 963,2250,4773,5681,2714,3658,3186,5682,5683,2596,2332,5684,4774, # 4896
|
||||
5685,5686,5687,3536, 957,3426,2547,2033,1931,2941,2467, 870,2019,3659,1746,2780, # 4912
|
||||
2781,2439,2468,5688,3930,5689,3789,3130,3790,3537,3427,3791,5690,1179,3086,5691, # 4928
|
||||
3187,2378,4416,3792,2548,3188,3131,2749,4143,5692,3428,1556,2549,2297, 977,2901, # 4944
|
||||
2034,4144,1205,3429,5693,1765,3430,3189,2125,1271, 714,1689,4775,3538,5694,2333, # 4960
|
||||
3931, 533,4417,3660,2184, 617,5695,2469,3340,3539,2315,5696,5697,3190,5698,5699, # 4976
|
||||
3932,1988, 618, 427,2651,3540,3431,5700,5701,1244,1690,5702,2819,4418,4776,5703, # 4992
|
||||
3541,4777,5704,2284,1576, 473,3661,4419,3432, 972,5705,3662,5706,3087,5707,5708, # 5008
|
||||
4778,4779,5709,3793,4145,4146,5710, 153,4780, 356,5711,1892,2902,4420,2144, 408, # 5024
|
||||
803,2357,5712,3933,5713,4421,1646,2578,2518,4781,4782,3934,5714,3935,4422,5715, # 5040
|
||||
2416,3433, 752,5716,5717,1962,3341,2979,5718, 746,3030,2470,4783,4423,3794, 698, # 5056
|
||||
4784,1893,4424,3663,2550,4785,3664,3936,5719,3191,3434,5720,1824,1302,4147,2715, # 5072
|
||||
3937,1974,4425,5721,4426,3192, 823,1303,1288,1236,2861,3542,4148,3435, 774,3938, # 5088
|
||||
5722,1581,4786,1304,2862,3939,4787,5723,2440,2162,1083,3268,4427,4149,4428, 344, # 5104
|
||||
1173, 288,2316, 454,1683,5724,5725,1461,4788,4150,2597,5726,5727,4789, 985, 894, # 5120
|
||||
5728,3436,3193,5729,1914,2942,3795,1989,5730,2111,1975,5731,4151,5732,2579,1194, # 5136
|
||||
425,5733,4790,3194,1245,3796,4429,5734,5735,2863,5736, 636,4791,1856,3940, 760, # 5152
|
||||
1800,5737,4430,2212,1508,4792,4152,1894,1684,2298,5738,5739,4793,4431,4432,2213, # 5168
|
||||
479,5740,5741, 832,5742,4153,2496,5743,2980,2497,3797, 990,3132, 627,1815,2652, # 5184
|
||||
4433,1582,4434,2126,2112,3543,4794,5744, 799,4435,3195,5745,4795,2113,1737,3031, # 5200
|
||||
1018, 543, 754,4436,3342,1676,4796,4797,4154,4798,1489,5746,3544,5747,2624,2903, # 5216
|
||||
4155,5748,5749,2981,5750,5751,5752,5753,3196,4799,4800,2185,1722,5754,3269,3270, # 5232
|
||||
1843,3665,1715, 481, 365,1976,1857,5755,5756,1963,2498,4801,5757,2127,3666,3271, # 5248
|
||||
433,1895,2064,2076,5758, 602,2750,5759,5760,5761,5762,5763,3032,1628,3437,5764, # 5264
|
||||
3197,4802,4156,2904,4803,2519,5765,2551,2782,5766,5767,5768,3343,4804,2905,5769, # 5280
|
||||
4805,5770,2864,4806,4807,1221,2982,4157,2520,5771,5772,5773,1868,1990,5774,5775, # 5296
|
||||
5776,1896,5777,5778,4808,1897,4158, 318,5779,2095,4159,4437,5780,5781, 485,5782, # 5312
|
||||
938,3941, 553,2680, 116,5783,3942,3667,5784,3545,2681,2783,3438,3344,2820,5785, # 5328
|
||||
3668,2943,4160,1747,2944,2983,5786,5787, 207,5788,4809,5789,4810,2521,5790,3033, # 5344
|
||||
890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, # 5360
|
||||
2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, # 5376
|
||||
)
|
||||
|
|
@ -1,107 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is Mozilla Communicator client code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .enums import ProbingState
|
||||
from .charsetprober import CharSetProber
|
||||
|
||||
|
||||
class CharSetGroupProber(CharSetProber):
|
||||
def __init__(self, lang_filter=None):
|
||||
super(CharSetGroupProber, self).__init__(lang_filter=lang_filter)
|
||||
self._active_num = 0
|
||||
self.probers = []
|
||||
self._best_guess_prober = None
|
||||
|
||||
def reset(self):
|
||||
super(CharSetGroupProber, self).reset()
|
||||
self._active_num = 0
|
||||
for prober in self.probers:
|
||||
if prober:
|
||||
prober.reset()
|
||||
prober.active = True
|
||||
self._active_num += 1
|
||||
self._best_guess_prober = None
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
if not self._best_guess_prober:
|
||||
self.get_confidence()
|
||||
if not self._best_guess_prober:
|
||||
return None
|
||||
return self._best_guess_prober.charset_name
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
if not self._best_guess_prober:
|
||||
self.get_confidence()
|
||||
if not self._best_guess_prober:
|
||||
return None
|
||||
return self._best_guess_prober.language
|
||||
|
||||
def feed(self, byte_str):
|
||||
for prober in self.probers:
|
||||
if not prober:
|
||||
continue
|
||||
if not prober.active:
|
||||
continue
|
||||
state = prober.feed(byte_str)
|
||||
if not state:
|
||||
continue
|
||||
if state == ProbingState.FOUND_IT:
|
||||
self._best_guess_prober = prober
|
||||
self._state = ProbingState.FOUND_IT
|
||||
return self.state
|
||||
elif state == ProbingState.NOT_ME:
|
||||
prober.active = False
|
||||
self._active_num -= 1
|
||||
if self._active_num <= 0:
|
||||
self._state = ProbingState.NOT_ME
|
||||
return self.state
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
state = self.state
|
||||
if state == ProbingState.FOUND_IT:
|
||||
return 0.99
|
||||
elif state == ProbingState.NOT_ME:
|
||||
return 0.01
|
||||
best_conf = 0.0
|
||||
self._best_guess_prober = None
|
||||
for prober in self.probers:
|
||||
if not prober:
|
||||
continue
|
||||
if not prober.active:
|
||||
self.logger.debug('%s not active', prober.charset_name)
|
||||
continue
|
||||
conf = prober.get_confidence()
|
||||
self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
|
||||
if best_conf < conf:
|
||||
best_conf = conf
|
||||
self._best_guess_prober = prober
|
||||
if not self._best_guess_prober:
|
||||
return 0.0
|
||||
return best_conf
|
|
@ -1,145 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
# Shy Shalom - original C code
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
from .enums import ProbingState
|
||||
|
||||
|
||||
class CharSetProber(object):
|
||||
|
||||
SHORTCUT_THRESHOLD = 0.95
|
||||
|
||||
def __init__(self, lang_filter=None):
|
||||
self._state = None
|
||||
self.lang_filter = lang_filter
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
def reset(self):
|
||||
self._state = ProbingState.DETECTING
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
return None
|
||||
|
||||
def feed(self, buf):
|
||||
pass
|
||||
|
||||
@property
|
||||
def state(self):
|
||||
return self._state
|
||||
|
||||
def get_confidence(self):
|
||||
return 0.0
|
||||
|
||||
@staticmethod
|
||||
def filter_high_byte_only(buf):
|
||||
buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
|
||||
return buf
|
||||
|
||||
@staticmethod
|
||||
def filter_international_words(buf):
|
||||
"""
|
||||
We define three types of bytes:
|
||||
alphabet: english alphabets [a-zA-Z]
|
||||
international: international characters [\x80-\xFF]
|
||||
marker: everything else [^a-zA-Z\x80-\xFF]
|
||||
|
||||
The input buffer can be thought to contain a series of words delimited
|
||||
by markers. This function works to filter all words that contain at
|
||||
least one international character. All contiguous sequences of markers
|
||||
are replaced by a single space ascii character.
|
||||
|
||||
This filter applies to all scripts which do not use English characters.
|
||||
"""
|
||||
filtered = bytearray()
|
||||
|
||||
# This regex expression filters out only words that have at-least one
|
||||
# international character. The word may include one marker character at
|
||||
# the end.
|
||||
words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
|
||||
buf)
|
||||
|
||||
for word in words:
|
||||
filtered.extend(word[:-1])
|
||||
|
||||
# If the last character in the word is a marker, replace it with a
|
||||
# space as markers shouldn't affect our analysis (they are used
|
||||
# similarly across all languages and may thus have similar
|
||||
# frequencies).
|
||||
last_char = word[-1:]
|
||||
if not last_char.isalpha() and last_char < b'\x80':
|
||||
last_char = b' '
|
||||
filtered.extend(last_char)
|
||||
|
||||
return filtered
|
||||
|
||||
@staticmethod
|
||||
def filter_with_english_letters(buf):
|
||||
"""
|
||||
Returns a copy of ``buf`` that retains only the sequences of English
|
||||
alphabet and high byte characters that are not between <> characters.
|
||||
Also retains English alphabet and high byte characters immediately
|
||||
before occurrences of >.
|
||||
|
||||
This filter can be applied to all scripts which contain both English
|
||||
characters and extended ASCII characters, but is currently only used by
|
||||
``Latin1Prober``.
|
||||
"""
|
||||
filtered = bytearray()
|
||||
in_tag = False
|
||||
prev = 0
|
||||
|
||||
for curr in range(len(buf)):
|
||||
# Slice here to get bytes instead of an int with Python 3
|
||||
buf_char = buf[curr:curr + 1]
|
||||
# Check if we're coming out of or entering an HTML tag
|
||||
if buf_char == b'>':
|
||||
in_tag = False
|
||||
elif buf_char == b'<':
|
||||
in_tag = True
|
||||
|
||||
# If current character is not extended-ASCII and not alphabetic...
|
||||
if buf_char < b'\x80' and not buf_char.isalpha():
|
||||
# ...and we're not in a tag
|
||||
if curr > prev and not in_tag:
|
||||
# Keep everything after last non-extended-ASCII,
|
||||
# non-alphabetic character
|
||||
filtered.extend(buf[prev:curr])
|
||||
# Output a space to delimit stretch we kept
|
||||
filtered.extend(b' ')
|
||||
prev = curr + 1
|
||||
|
||||
# If we're not in a tag...
|
||||
if not in_tag:
|
||||
# Keep everything after last non-extended-ASCII, non-alphabetic
|
||||
# character
|
||||
filtered.extend(buf[prev:])
|
||||
|
||||
return filtered
|
|
@ -1 +0,0 @@
|
|||
|
|
@ -1,88 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is mozilla.org code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import logging
|
||||
|
||||
from .enums import MachineState
|
||||
|
||||
|
||||
class CodingStateMachine(object):
|
||||
"""
|
||||
A state machine to verify a byte sequence for a particular encoding. For
|
||||
each byte the detector receives, it will feed that byte to every active
|
||||
state machine available, one byte at a time. The state machine changes its
|
||||
state based on its previous state and the byte it receives. There are 3
|
||||
states in a state machine that are of interest to an auto-detector:
|
||||
|
||||
START state: This is the state to start with, or a legal byte sequence
|
||||
(i.e. a valid code point) for character has been identified.
|
||||
|
||||
ME state: This indicates that the state machine identified a byte sequence
|
||||
that is specific to the charset it is designed for and that
|
||||
there is no other possible encoding which can contain this byte
|
||||
sequence. This will to lead to an immediate positive answer for
|
||||
the detector.
|
||||
|
||||
ERROR state: This indicates the state machine identified an illegal byte
|
||||
sequence for that encoding. This will lead to an immediate
|
||||
negative answer for this encoding. Detector will exclude this
|
||||
encoding from consideration from here on.
|
||||
"""
|
||||
def __init__(self, sm):
|
||||
self._model = sm
|
||||
self._curr_byte_pos = 0
|
||||
self._curr_char_len = 0
|
||||
self._curr_state = None
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self._curr_state = MachineState.START
|
||||
|
||||
def next_state(self, c):
|
||||
# for each byte we get its class
|
||||
# if it is first byte, we also get byte length
|
||||
byte_class = self._model['class_table'][c]
|
||||
if self._curr_state == MachineState.START:
|
||||
self._curr_byte_pos = 0
|
||||
self._curr_char_len = self._model['char_len_table'][byte_class]
|
||||
# from byte's class and state_table, we get its next state
|
||||
curr_state = (self._curr_state * self._model['class_factor']
|
||||
+ byte_class)
|
||||
self._curr_state = self._model['state_table'][curr_state]
|
||||
self._curr_byte_pos += 1
|
||||
return self._curr_state
|
||||
|
||||
def get_current_charlen(self):
|
||||
return self._curr_char_len
|
||||
|
||||
def get_coding_state_machine(self):
|
||||
return self._model['name']
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
return self._model['language']
|
|
@ -1,76 +0,0 @@
|
|||
"""
|
||||
All of the Enums that are used throughout the chardet package.
|
||||
|
||||
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||
"""
|
||||
|
||||
|
||||
class InputState(object):
|
||||
"""
|
||||
This enum represents the different states a universal detector can be in.
|
||||
"""
|
||||
PURE_ASCII = 0
|
||||
ESC_ASCII = 1
|
||||
HIGH_BYTE = 2
|
||||
|
||||
|
||||
class LanguageFilter(object):
|
||||
"""
|
||||
This enum represents the different language filters we can apply to a
|
||||
``UniversalDetector``.
|
||||
"""
|
||||
CHINESE_SIMPLIFIED = 0x01
|
||||
CHINESE_TRADITIONAL = 0x02
|
||||
JAPANESE = 0x04
|
||||
KOREAN = 0x08
|
||||
NON_CJK = 0x10
|
||||
ALL = 0x1F
|
||||
CHINESE = CHINESE_SIMPLIFIED | CHINESE_TRADITIONAL
|
||||
CJK = CHINESE | JAPANESE | KOREAN
|
||||
|
||||
|
||||
class ProbingState(object):
|
||||
"""
|
||||
This enum represents the different states a prober can be in.
|
||||
"""
|
||||
DETECTING = 0
|
||||
FOUND_IT = 1
|
||||
NOT_ME = 2
|
||||
|
||||
|
||||
class MachineState(object):
|
||||
"""
|
||||
This enum represents the different states a state machine can be in.
|
||||
"""
|
||||
START = 0
|
||||
ERROR = 1
|
||||
ITS_ME = 2
|
||||
|
||||
|
||||
class SequenceLikelihood(object):
|
||||
"""
|
||||
This enum represents the likelihood of a character following the previous one.
|
||||
"""
|
||||
NEGATIVE = 0
|
||||
UNLIKELY = 1
|
||||
LIKELY = 2
|
||||
POSITIVE = 3
|
||||
|
||||
@classmethod
|
||||
def get_num_categories(cls):
|
||||
""":returns: The number of likelihood categories in the enum."""
|
||||
return 4
|
||||
|
||||
|
||||
class CharacterCategory(object):
|
||||
"""
|
||||
This enum represents the different categories language models for
|
||||
``SingleByteCharsetProber`` put characters into.
|
||||
|
||||
Anything less than CONTROL is considered a letter.
|
||||
"""
|
||||
UNDEFINED = 255
|
||||
LINE_BREAK = 254
|
||||
SYMBOL = 253
|
||||
DIGIT = 252
|
||||
CONTROL = 251
|
|
@ -1,101 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is mozilla.org code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .enums import LanguageFilter, ProbingState, MachineState
|
||||
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
|
||||
ISO2022KR_SM_MODEL)
|
||||
|
||||
|
||||
class EscCharSetProber(CharSetProber):
|
||||
"""
|
||||
This CharSetProber uses a "code scheme" approach for detecting encodings,
|
||||
whereby easily recognizable escape or shift sequences are relied on to
|
||||
identify these encodings.
|
||||
"""
|
||||
|
||||
def __init__(self, lang_filter=None):
|
||||
super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
|
||||
self.coding_sm = []
|
||||
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
|
||||
self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
|
||||
self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL))
|
||||
if self.lang_filter & LanguageFilter.JAPANESE:
|
||||
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
|
||||
if self.lang_filter & LanguageFilter.KOREAN:
|
||||
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
|
||||
self.active_sm_count = None
|
||||
self._detected_charset = None
|
||||
self._detected_language = None
|
||||
self._state = None
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
super(EscCharSetProber, self).reset()
|
||||
for coding_sm in self.coding_sm:
|
||||
if not coding_sm:
|
||||
continue
|
||||
coding_sm.active = True
|
||||
coding_sm.reset()
|
||||
self.active_sm_count = len(self.coding_sm)
|
||||
self._detected_charset = None
|
||||
self._detected_language = None
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
return self._detected_charset
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
return self._detected_language
|
||||
|
||||
def get_confidence(self):
|
||||
if self._detected_charset:
|
||||
return 0.99
|
||||
else:
|
||||
return 0.00
|
||||
|
||||
def feed(self, byte_str):
|
||||
for c in byte_str:
|
||||
for coding_sm in self.coding_sm:
|
||||
if not coding_sm or not coding_sm.active:
|
||||
continue
|
||||
coding_state = coding_sm.next_state(c)
|
||||
if coding_state == MachineState.ERROR:
|
||||
coding_sm.active = False
|
||||
self.active_sm_count -= 1
|
||||
if self.active_sm_count <= 0:
|
||||
self._state = ProbingState.NOT_ME
|
||||
return self.state
|
||||
elif coding_state == MachineState.ITS_ME:
|
||||
self._state = ProbingState.FOUND_IT
|
||||
self._detected_charset = coding_sm.get_coding_state_machine()
|
||||
self._detected_language = coding_sm.language
|
||||
return self.state
|
||||
|
||||
return self.state
|
|
@ -1,246 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is mozilla.org code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .enums import MachineState
|
||||
|
||||
HZ_CLS = (
|
||||
1,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,0,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,0,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,4,0,5,2,0, # 78 - 7f
|
||||
1,1,1,1,1,1,1,1, # 80 - 87
|
||||
1,1,1,1,1,1,1,1, # 88 - 8f
|
||||
1,1,1,1,1,1,1,1, # 90 - 97
|
||||
1,1,1,1,1,1,1,1, # 98 - 9f
|
||||
1,1,1,1,1,1,1,1, # a0 - a7
|
||||
1,1,1,1,1,1,1,1, # a8 - af
|
||||
1,1,1,1,1,1,1,1, # b0 - b7
|
||||
1,1,1,1,1,1,1,1, # b8 - bf
|
||||
1,1,1,1,1,1,1,1, # c0 - c7
|
||||
1,1,1,1,1,1,1,1, # c8 - cf
|
||||
1,1,1,1,1,1,1,1, # d0 - d7
|
||||
1,1,1,1,1,1,1,1, # d8 - df
|
||||
1,1,1,1,1,1,1,1, # e0 - e7
|
||||
1,1,1,1,1,1,1,1, # e8 - ef
|
||||
1,1,1,1,1,1,1,1, # f0 - f7
|
||||
1,1,1,1,1,1,1,1, # f8 - ff
|
||||
)
|
||||
|
||||
HZ_ST = (
|
||||
MachineState.START,MachineState.ERROR, 3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START, 4,MachineState.ERROR,# 10-17
|
||||
5,MachineState.ERROR, 6,MachineState.ERROR, 5, 5, 4,MachineState.ERROR,# 18-1f
|
||||
4,MachineState.ERROR, 4, 4, 4,MachineState.ERROR, 4,MachineState.ERROR,# 20-27
|
||||
4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f
|
||||
)
|
||||
|
||||
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
||||
|
||||
HZ_SM_MODEL = {'class_table': HZ_CLS,
|
||||
'class_factor': 6,
|
||||
'state_table': HZ_ST,
|
||||
'char_len_table': HZ_CHAR_LEN_TABLE,
|
||||
'name': "HZ-GB-2312",
|
||||
'language': 'Chinese'}
|
||||
|
||||
ISO2022CN_CLS = (
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,3,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,4,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
2,2,2,2,2,2,2,2, # 80 - 87
|
||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
||||
2,2,2,2,2,2,2,2, # 90 - 97
|
||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
)
|
||||
|
||||
ISO2022CN_ST = (
|
||||
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
|
||||
MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,# 18-1f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27
|
||||
5, 6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f
|
||||
)
|
||||
|
||||
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||
|
||||
ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS,
|
||||
'class_factor': 9,
|
||||
'state_table': ISO2022CN_ST,
|
||||
'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
|
||||
'name': "ISO-2022-CN",
|
||||
'language': 'Chinese'}
|
||||
|
||||
ISO2022JP_CLS = (
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,2,2, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,7,0,0,0, # 20 - 27
|
||||
3,0,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
6,0,4,0,8,0,0,0, # 40 - 47
|
||||
0,9,5,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
2,2,2,2,2,2,2,2, # 80 - 87
|
||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
||||
2,2,2,2,2,2,2,2, # 90 - 97
|
||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
)
|
||||
|
||||
ISO2022JP_ST = (
|
||||
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
|
||||
MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f
|
||||
MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 20-27
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47
|
||||
)
|
||||
|
||||
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||
|
||||
ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS,
|
||||
'class_factor': 10,
|
||||
'state_table': ISO2022JP_ST,
|
||||
'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
|
||||
'name': "ISO-2022-JP",
|
||||
'language': 'Japanese'}
|
||||
|
||||
ISO2022KR_CLS = (
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,3,0,0,0, # 20 - 27
|
||||
0,4,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,5,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
2,2,2,2,2,2,2,2, # 80 - 87
|
||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
||||
2,2,2,2,2,2,2,2, # 90 - 97
|
||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
)
|
||||
|
||||
ISO2022KR_ST = (
|
||||
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 10-17
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27
|
||||
)
|
||||
|
||||
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
||||
|
||||
ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS,
|
||||
'class_factor': 6,
|
||||
'state_table': ISO2022KR_ST,
|
||||
'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
|
||||
'name': "ISO-2022-KR",
|
||||
'language': 'Korean'}
|
||||
|
||||
|
|
@ -1,92 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is mozilla.org code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .enums import ProbingState, MachineState
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import EUCJPDistributionAnalysis
|
||||
from .jpcntx import EUCJPContextAnalysis
|
||||
from .mbcssm import EUCJP_SM_MODEL
|
||||
|
||||
|
||||
class EUCJPProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
super(EUCJPProber, self).__init__()
|
||||
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
|
||||
self.distribution_analyzer = EUCJPDistributionAnalysis()
|
||||
self.context_analyzer = EUCJPContextAnalysis()
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
super(EUCJPProber, self).reset()
|
||||
self.context_analyzer.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
return "EUC-JP"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
return "Japanese"
|
||||
|
||||
def feed(self, byte_str):
|
||||
for i in range(len(byte_str)):
|
||||
# PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
|
||||
coding_state = self.coding_sm.next_state(byte_str[i])
|
||||
if coding_state == MachineState.ERROR:
|
||||
self.logger.debug('%s %s prober hit error at byte %s',
|
||||
self.charset_name, self.language, i)
|
||||
self._state = ProbingState.NOT_ME
|
||||
break
|
||||
elif coding_state == MachineState.ITS_ME:
|
||||
self._state = ProbingState.FOUND_IT
|
||||
break
|
||||
elif coding_state == MachineState.START:
|
||||
char_len = self.coding_sm.get_current_charlen()
|
||||
if i == 0:
|
||||
self._last_char[1] = byte_str[0]
|
||||
self.context_analyzer.feed(self._last_char, char_len)
|
||||
self.distribution_analyzer.feed(self._last_char, char_len)
|
||||
else:
|
||||
self.context_analyzer.feed(byte_str[i - 1:i + 1],
|
||||
char_len)
|
||||
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
||||
char_len)
|
||||
|
||||
self._last_char[0] = byte_str[-1]
|
||||
|
||||
if self.state == ProbingState.DETECTING:
|
||||
if (self.context_analyzer.got_enough_data() and
|
||||
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
||||
self._state = ProbingState.FOUND_IT
|
||||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
context_conf = self.context_analyzer.get_confidence()
|
||||
distrib_conf = self.distribution_analyzer.get_confidence()
|
||||
return max(context_conf, distrib_conf)
|
|
@ -1,195 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is Mozilla Communicator client code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
# Sampling from about 20M text materials include literature and computer technology
|
||||
|
||||
# 128 --> 0.79
|
||||
# 256 --> 0.92
|
||||
# 512 --> 0.986
|
||||
# 1024 --> 0.99944
|
||||
# 2048 --> 0.99999
|
||||
#
|
||||
# Idea Distribution Ratio = 0.98653 / (1-0.98653) = 73.24
|
||||
# Random Distribution Ration = 512 / (2350-512) = 0.279.
|
||||
#
|
||||
# Typical Distribution Ratio
|
||||
|
||||
EUCKR_TYPICAL_DISTRIBUTION_RATIO = 6.0
|
||||
|
||||
EUCKR_TABLE_SIZE = 2352
|
||||
|
||||
# Char to FreqOrder table ,
|
||||
EUCKR_CHAR_TO_FREQ_ORDER = (
|
||||
13, 130, 120,1396, 481,1719,1720, 328, 609, 212,1721, 707, 400, 299,1722, 87,
|
||||
1397,1723, 104, 536,1117,1203,1724,1267, 685,1268, 508,1725,1726,1727,1728,1398,
|
||||
1399,1729,1730,1731, 141, 621, 326,1057, 368,1732, 267, 488, 20,1733,1269,1734,
|
||||
945,1400,1735, 47, 904,1270,1736,1737, 773, 248,1738, 409, 313, 786, 429,1739,
|
||||
116, 987, 813,1401, 683, 75,1204, 145,1740,1741,1742,1743, 16, 847, 667, 622,
|
||||
708,1744,1745,1746, 966, 787, 304, 129,1747, 60, 820, 123, 676,1748,1749,1750,
|
||||
1751, 617,1752, 626,1753,1754,1755,1756, 653,1757,1758,1759,1760,1761,1762, 856,
|
||||
344,1763,1764,1765,1766, 89, 401, 418, 806, 905, 848,1767,1768,1769, 946,1205,
|
||||
709,1770,1118,1771, 241,1772,1773,1774,1271,1775, 569,1776, 999,1777,1778,1779,
|
||||
1780, 337, 751,1058, 28, 628, 254,1781, 177, 906, 270, 349, 891,1079,1782, 19,
|
||||
1783, 379,1784, 315,1785, 629, 754,1402, 559,1786, 636, 203,1206,1787, 710, 567,
|
||||
1788, 935, 814,1789,1790,1207, 766, 528,1791,1792,1208,1793,1794,1795,1796,1797,
|
||||
1403,1798,1799, 533,1059,1404,1405,1156,1406, 936, 884,1080,1800, 351,1801,1802,
|
||||
1803,1804,1805, 801,1806,1807,1808,1119,1809,1157, 714, 474,1407,1810, 298, 899,
|
||||
885,1811,1120, 802,1158,1812, 892,1813,1814,1408, 659,1815,1816,1121,1817,1818,
|
||||
1819,1820,1821,1822, 319,1823, 594, 545,1824, 815, 937,1209,1825,1826, 573,1409,
|
||||
1022,1827,1210,1828,1829,1830,1831,1832,1833, 556, 722, 807,1122,1060,1834, 697,
|
||||
1835, 900, 557, 715,1836,1410, 540,1411, 752,1159, 294, 597,1211, 976, 803, 770,
|
||||
1412,1837,1838, 39, 794,1413, 358,1839, 371, 925,1840, 453, 661, 788, 531, 723,
|
||||
544,1023,1081, 869, 91,1841, 392, 430, 790, 602,1414, 677,1082, 457,1415,1416,
|
||||
1842,1843, 475, 327,1024,1417, 795, 121,1844, 733, 403,1418,1845,1846,1847, 300,
|
||||
119, 711,1212, 627,1848,1272, 207,1849,1850, 796,1213, 382,1851, 519,1852,1083,
|
||||
893,1853,1854,1855, 367, 809, 487, 671,1856, 663,1857,1858, 956, 471, 306, 857,
|
||||
1859,1860,1160,1084,1861,1862,1863,1864,1865,1061,1866,1867,1868,1869,1870,1871,
|
||||
282, 96, 574,1872, 502,1085,1873,1214,1874, 907,1875,1876, 827, 977,1419,1420,
|
||||
1421, 268,1877,1422,1878,1879,1880, 308,1881, 2, 537,1882,1883,1215,1884,1885,
|
||||
127, 791,1886,1273,1423,1887, 34, 336, 404, 643,1888, 571, 654, 894, 840,1889,
|
||||
0, 886,1274, 122, 575, 260, 908, 938,1890,1275, 410, 316,1891,1892, 100,1893,
|
||||
1894,1123, 48,1161,1124,1025,1895, 633, 901,1276,1896,1897, 115, 816,1898, 317,
|
||||
1899, 694,1900, 909, 734,1424, 572, 866,1425, 691, 85, 524,1010, 543, 394, 841,
|
||||
1901,1902,1903,1026,1904,1905,1906,1907,1908,1909, 30, 451, 651, 988, 310,1910,
|
||||
1911,1426, 810,1216, 93,1912,1913,1277,1217,1914, 858, 759, 45, 58, 181, 610,
|
||||
269,1915,1916, 131,1062, 551, 443,1000, 821,1427, 957, 895,1086,1917,1918, 375,
|
||||
1919, 359,1920, 687,1921, 822,1922, 293,1923,1924, 40, 662, 118, 692, 29, 939,
|
||||
887, 640, 482, 174,1925, 69,1162, 728,1428, 910,1926,1278,1218,1279, 386, 870,
|
||||
217, 854,1163, 823,1927,1928,1929,1930, 834,1931, 78,1932, 859,1933,1063,1934,
|
||||
1935,1936,1937, 438,1164, 208, 595,1938,1939,1940,1941,1219,1125,1942, 280, 888,
|
||||
1429,1430,1220,1431,1943,1944,1945,1946,1947,1280, 150, 510,1432,1948,1949,1950,
|
||||
1951,1952,1953,1954,1011,1087,1955,1433,1043,1956, 881,1957, 614, 958,1064,1065,
|
||||
1221,1958, 638,1001, 860, 967, 896,1434, 989, 492, 553,1281,1165,1959,1282,1002,
|
||||
1283,1222,1960,1961,1962,1963, 36, 383, 228, 753, 247, 454,1964, 876, 678,1965,
|
||||
1966,1284, 126, 464, 490, 835, 136, 672, 529, 940,1088,1435, 473,1967,1968, 467,
|
||||
50, 390, 227, 587, 279, 378, 598, 792, 968, 240, 151, 160, 849, 882,1126,1285,
|
||||
639,1044, 133, 140, 288, 360, 811, 563,1027, 561, 142, 523,1969,1970,1971, 7,
|
||||
103, 296, 439, 407, 506, 634, 990,1972,1973,1974,1975, 645,1976,1977,1978,1979,
|
||||
1980,1981, 236,1982,1436,1983,1984,1089, 192, 828, 618, 518,1166, 333,1127,1985,
|
||||
818,1223,1986,1987,1988,1989,1990,1991,1992,1993, 342,1128,1286, 746, 842,1994,
|
||||
1995, 560, 223,1287, 98, 8, 189, 650, 978,1288,1996,1437,1997, 17, 345, 250,
|
||||
423, 277, 234, 512, 226, 97, 289, 42, 167,1998, 201,1999,2000, 843, 836, 824,
|
||||
532, 338, 783,1090, 182, 576, 436,1438,1439, 527, 500,2001, 947, 889,2002,2003,
|
||||
2004,2005, 262, 600, 314, 447,2006, 547,2007, 693, 738,1129,2008, 71,1440, 745,
|
||||
619, 688,2009, 829,2010,2011, 147,2012, 33, 948,2013,2014, 74, 224,2015, 61,
|
||||
191, 918, 399, 637,2016,1028,1130, 257, 902,2017,2018,2019,2020,2021,2022,2023,
|
||||
2024,2025,2026, 837,2027,2028,2029,2030, 179, 874, 591, 52, 724, 246,2031,2032,
|
||||
2033,2034,1167, 969,2035,1289, 630, 605, 911,1091,1168,2036,2037,2038,1441, 912,
|
||||
2039, 623,2040,2041, 253,1169,1290,2042,1442, 146, 620, 611, 577, 433,2043,1224,
|
||||
719,1170, 959, 440, 437, 534, 84, 388, 480,1131, 159, 220, 198, 679,2044,1012,
|
||||
819,1066,1443, 113,1225, 194, 318,1003,1029,2045,2046,2047,2048,1067,2049,2050,
|
||||
2051,2052,2053, 59, 913, 112,2054, 632,2055, 455, 144, 739,1291,2056, 273, 681,
|
||||
499,2057, 448,2058,2059, 760,2060,2061, 970, 384, 169, 245,1132,2062,2063, 414,
|
||||
1444,2064,2065, 41, 235,2066, 157, 252, 877, 568, 919, 789, 580,2067, 725,2068,
|
||||
2069,1292,2070,2071,1445,2072,1446,2073,2074, 55, 588, 66,1447, 271,1092,2075,
|
||||
1226,2076, 960,1013, 372,2077,2078,2079,2080,2081,1293,2082,2083,2084,2085, 850,
|
||||
2086,2087,2088,2089,2090, 186,2091,1068, 180,2092,2093,2094, 109,1227, 522, 606,
|
||||
2095, 867,1448,1093, 991,1171, 926, 353,1133,2096, 581,2097,2098,2099,1294,1449,
|
||||
1450,2100, 596,1172,1014,1228,2101,1451,1295,1173,1229,2102,2103,1296,1134,1452,
|
||||
949,1135,2104,2105,1094,1453,1454,1455,2106,1095,2107,2108,2109,2110,2111,2112,
|
||||
2113,2114,2115,2116,2117, 804,2118,2119,1230,1231, 805,1456, 405,1136,2120,2121,
|
||||
2122,2123,2124, 720, 701,1297, 992,1457, 927,1004,2125,2126,2127,2128,2129,2130,
|
||||
22, 417,2131, 303,2132, 385,2133, 971, 520, 513,2134,1174, 73,1096, 231, 274,
|
||||
962,1458, 673,2135,1459,2136, 152,1137,2137,2138,2139,2140,1005,1138,1460,1139,
|
||||
2141,2142,2143,2144, 11, 374, 844,2145, 154,1232, 46,1461,2146, 838, 830, 721,
|
||||
1233, 106,2147, 90, 428, 462, 578, 566,1175, 352,2148,2149, 538,1234, 124,1298,
|
||||
2150,1462, 761, 565,2151, 686,2152, 649,2153, 72, 173,2154, 460, 415,2155,1463,
|
||||
2156,1235, 305,2157,2158,2159,2160,2161,2162, 579,2163,2164,2165,2166,2167, 747,
|
||||
2168,2169,2170,2171,1464, 669,2172,2173,2174,2175,2176,1465,2177, 23, 530, 285,
|
||||
2178, 335, 729,2179, 397,2180,2181,2182,1030,2183,2184, 698,2185,2186, 325,2187,
|
||||
2188, 369,2189, 799,1097,1015, 348,2190,1069, 680,2191, 851,1466,2192,2193, 10,
|
||||
2194, 613, 424,2195, 979, 108, 449, 589, 27, 172, 81,1031, 80, 774, 281, 350,
|
||||
1032, 525, 301, 582,1176,2196, 674,1045,2197,2198,1467, 730, 762,2199,2200,2201,
|
||||
2202,1468,2203, 993,2204,2205, 266,1070, 963,1140,2206,2207,2208, 664,1098, 972,
|
||||
2209,2210,2211,1177,1469,1470, 871,2212,2213,2214,2215,2216,1471,2217,2218,2219,
|
||||
2220,2221,2222,2223,2224,2225,2226,2227,1472,1236,2228,2229,2230,2231,2232,2233,
|
||||
2234,2235,1299,2236,2237, 200,2238, 477, 373,2239,2240, 731, 825, 777,2241,2242,
|
||||
2243, 521, 486, 548,2244,2245,2246,1473,1300, 53, 549, 137, 875, 76, 158,2247,
|
||||
1301,1474, 469, 396,1016, 278, 712,2248, 321, 442, 503, 767, 744, 941,1237,1178,
|
||||
1475,2249, 82, 178,1141,1179, 973,2250,1302,2251, 297,2252,2253, 570,2254,2255,
|
||||
2256, 18, 450, 206,2257, 290, 292,1142,2258, 511, 162, 99, 346, 164, 735,2259,
|
||||
1476,1477, 4, 554, 343, 798,1099,2260,1100,2261, 43, 171,1303, 139, 215,2262,
|
||||
2263, 717, 775,2264,1033, 322, 216,2265, 831,2266, 149,2267,1304,2268,2269, 702,
|
||||
1238, 135, 845, 347, 309,2270, 484,2271, 878, 655, 238,1006,1478,2272, 67,2273,
|
||||
295,2274,2275, 461,2276, 478, 942, 412,2277,1034,2278,2279,2280, 265,2281, 541,
|
||||
2282,2283,2284,2285,2286, 70, 852,1071,2287,2288,2289,2290, 21, 56, 509, 117,
|
||||
432,2291,2292, 331, 980, 552,1101, 148, 284, 105, 393,1180,1239, 755,2293, 187,
|
||||
2294,1046,1479,2295, 340,2296, 63,1047, 230,2297,2298,1305, 763,1306, 101, 800,
|
||||
808, 494,2299,2300,2301, 903,2302, 37,1072, 14, 5,2303, 79, 675,2304, 312,
|
||||
2305,2306,2307,2308,2309,1480, 6,1307,2310,2311,2312, 1, 470, 35, 24, 229,
|
||||
2313, 695, 210, 86, 778, 15, 784, 592, 779, 32, 77, 855, 964,2314, 259,2315,
|
||||
501, 380,2316,2317, 83, 981, 153, 689,1308,1481,1482,1483,2318,2319, 716,1484,
|
||||
2320,2321,2322,2323,2324,2325,1485,2326,2327, 128, 57, 68, 261,1048, 211, 170,
|
||||
1240, 31,2328, 51, 435, 742,2329,2330,2331, 635,2332, 264, 456,2333,2334,2335,
|
||||
425,2336,1486, 143, 507, 263, 943,2337, 363, 920,1487, 256,1488,1102, 243, 601,
|
||||
1489,2338,2339,2340,2341,2342,2343,2344, 861,2345,2346,2347,2348,2349,2350, 395,
|
||||
2351,1490,1491, 62, 535, 166, 225,2352,2353, 668, 419,1241, 138, 604, 928,2354,
|
||||
1181,2355,1492,1493,2356,2357,2358,1143,2359, 696,2360, 387, 307,1309, 682, 476,
|
||||
2361,2362, 332, 12, 222, 156,2363, 232,2364, 641, 276, 656, 517,1494,1495,1035,
|
||||
416, 736,1496,2365,1017, 586,2366,2367,2368,1497,2369, 242,2370,2371,2372,1498,
|
||||
2373, 965, 713,2374,2375,2376,2377, 740, 982,1499, 944,1500,1007,2378,2379,1310,
|
||||
1501,2380,2381,2382, 785, 329,2383,2384,1502,2385,2386,2387, 932,2388,1503,2389,
|
||||
2390,2391,2392,1242,2393,2394,2395,2396,2397, 994, 950,2398,2399,2400,2401,1504,
|
||||
1311,2402,2403,2404,2405,1049, 749,2406,2407, 853, 718,1144,1312,2408,1182,1505,
|
||||
2409,2410, 255, 516, 479, 564, 550, 214,1506,1507,1313, 413, 239, 444, 339,1145,
|
||||
1036,1508,1509,1314,1037,1510,1315,2411,1511,2412,2413,2414, 176, 703, 497, 624,
|
||||
593, 921, 302,2415, 341, 165,1103,1512,2416,1513,2417,2418,2419, 376,2420, 700,
|
||||
2421,2422,2423, 258, 768,1316,2424,1183,2425, 995, 608,2426,2427,2428,2429, 221,
|
||||
2430,2431,2432,2433,2434,2435,2436,2437, 195, 323, 726, 188, 897, 983,1317, 377,
|
||||
644,1050, 879,2438, 452,2439,2440,2441,2442,2443,2444, 914,2445,2446,2447,2448,
|
||||
915, 489,2449,1514,1184,2450,2451, 515, 64, 427, 495,2452, 583,2453, 483, 485,
|
||||
1038, 562, 213,1515, 748, 666,2454,2455,2456,2457, 334,2458, 780, 996,1008, 705,
|
||||
1243,2459,2460,2461,2462,2463, 114,2464, 493,1146, 366, 163,1516, 961,1104,2465,
|
||||
291,2466,1318,1105,2467,1517, 365,2468, 355, 951,1244,2469,1319,2470, 631,2471,
|
||||
2472, 218,1320, 364, 320, 756,1518,1519,1321,1520,1322,2473,2474,2475,2476, 997,
|
||||
2477,2478,2479,2480, 665,1185,2481, 916,1521,2482,2483,2484, 584, 684,2485,2486,
|
||||
797,2487,1051,1186,2488,2489,2490,1522,2491,2492, 370,2493,1039,1187, 65,2494,
|
||||
434, 205, 463,1188,2495, 125, 812, 391, 402, 826, 699, 286, 398, 155, 781, 771,
|
||||
585,2496, 590, 505,1073,2497, 599, 244, 219, 917,1018, 952, 646,1523,2498,1323,
|
||||
2499,2500, 49, 984, 354, 741,2501, 625,2502,1324,2503,1019, 190, 357, 757, 491,
|
||||
95, 782, 868,2504,2505,2506,2507,2508,2509, 134,1524,1074, 422,1525, 898,2510,
|
||||
161,2511,2512,2513,2514, 769,2515,1526,2516,2517, 411,1325,2518, 472,1527,2519,
|
||||
2520,2521,2522,2523,2524, 985,2525,2526,2527,2528,2529,2530, 764,2531,1245,2532,
|
||||
2533, 25, 204, 311,2534, 496,2535,1052,2536,2537,2538,2539,2540,2541,2542, 199,
|
||||
704, 504, 468, 758, 657,1528, 196, 44, 839,1246, 272, 750,2543, 765, 862,2544,
|
||||
2545,1326,2546, 132, 615, 933,2547, 732,2548,2549,2550,1189,1529,2551, 283,1247,
|
||||
1053, 607, 929,2552,2553,2554, 930, 183, 872, 616,1040,1147,2555,1148,1020, 441,
|
||||
249,1075,2556,2557,2558, 466, 743,2559,2560,2561, 92, 514, 426, 420, 526,2562,
|
||||
2563,2564,2565,2566,2567,2568, 185,2569,2570,2571,2572, 776,1530, 658,2573, 362,
|
||||
2574, 361, 922,1076, 793,2575,2576,2577,2578,2579,2580,1531, 251,2581,2582,2583,
|
||||
2584,1532, 54, 612, 237,1327,2585,2586, 275, 408, 647, 111,2587,1533,1106, 465,
|
||||
3, 458, 9, 38,2588, 107, 110, 890, 209, 26, 737, 498,2589,1534,2590, 431,
|
||||
202, 88,1535, 356, 287,1107, 660,1149,2591, 381,1536, 986,1150, 445,1248,1151,
|
||||
974,2592,2593, 846,2594, 446, 953, 184,1249,1250, 727,2595, 923, 193, 883,2596,
|
||||
2597,2598, 102, 324, 539, 817,2599, 421,1041,2600, 832,2601, 94, 175, 197, 406,
|
||||
2602, 459,2603,2604,2605,2606,2607, 330, 555,2608,2609,2610, 706,1108, 389,2611,
|
||||
2612,2613,2614, 233,2615, 833, 558, 931, 954,1251,2616,2617,1537, 546,2618,2619,
|
||||
1009,2620,2621,2622,1538, 690,1328,2623, 955,2624,1539,2625,2626, 772,2627,2628,
|
||||
2629,2630,2631, 924, 648, 863, 603,2632,2633, 934,1540, 864, 865,2634, 642,1042,
|
||||
670,1190,2635,2636,2637,2638, 168,2639, 652, 873, 542,1054,1541,2640,2641,2642, # 512, 256
|
||||
)
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,91 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
# Shy Shalom - original C code
|
||||
# Proofpoint, Inc.
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import ProbingState, MachineState
|
||||
|
||||
|
||||
class MultiByteCharSetProber(CharSetProber):
|
||||
"""
|
||||
MultiByteCharSetProber
|
||||
"""
|
||||
|
||||
def __init__(self, lang_filter=None):
|
||||
super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter)
|
||||
self.distribution_analyzer = None
|
||||
self.coding_sm = None
|
||||
self._last_char = [0, 0]
|
||||
|
||||
def reset(self):
|
||||
super(MultiByteCharSetProber, self).reset()
|
||||
if self.coding_sm:
|
||||
self.coding_sm.reset()
|
||||
if self.distribution_analyzer:
|
||||
self.distribution_analyzer.reset()
|
||||
self._last_char = [0, 0]
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def feed(self, byte_str):
|
||||
for i in range(len(byte_str)):
|
||||
coding_state = self.coding_sm.next_state(byte_str[i])
|
||||
if coding_state == MachineState.ERROR:
|
||||
self.logger.debug('%s %s prober hit error at byte %s',
|
||||
self.charset_name, self.language, i)
|
||||
self._state = ProbingState.NOT_ME
|
||||
break
|
||||
elif coding_state == MachineState.ITS_ME:
|
||||
self._state = ProbingState.FOUND_IT
|
||||
break
|
||||
elif coding_state == MachineState.START:
|
||||
char_len = self.coding_sm.get_current_charlen()
|
||||
if i == 0:
|
||||
self._last_char[1] = byte_str[0]
|
||||
self.distribution_analyzer.feed(self._last_char, char_len)
|
||||
else:
|
||||
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
||||
char_len)
|
||||
|
||||
self._last_char[0] = byte_str[-1]
|
||||
|
||||
if self.state == ProbingState.DETECTING:
|
||||
if (self.distribution_analyzer.got_enough_data() and
|
||||
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
||||
self._state = ProbingState.FOUND_IT
|
||||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
return self.distribution_analyzer.get_confidence()
|
|
@ -1,572 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is mozilla.org code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .enums import MachineState
|
||||
|
||||
# BIG5
|
||||
|
||||
BIG5_CLS = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
||||
4,4,4,4,4,4,4,4, # 80 - 87
|
||||
4,4,4,4,4,4,4,4, # 88 - 8f
|
||||
4,4,4,4,4,4,4,4, # 90 - 97
|
||||
4,4,4,4,4,4,4,4, # 98 - 9f
|
||||
4,3,3,3,3,3,3,3, # a0 - a7
|
||||
3,3,3,3,3,3,3,3, # a8 - af
|
||||
3,3,3,3,3,3,3,3, # b0 - b7
|
||||
3,3,3,3,3,3,3,3, # b8 - bf
|
||||
3,3,3,3,3,3,3,3, # c0 - c7
|
||||
3,3,3,3,3,3,3,3, # c8 - cf
|
||||
3,3,3,3,3,3,3,3, # d0 - d7
|
||||
3,3,3,3,3,3,3,3, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,3,3,3, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,3,3,0 # f8 - ff
|
||||
)
|
||||
|
||||
BIG5_ST = (
|
||||
MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f
|
||||
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17
|
||||
)
|
||||
|
||||
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
|
||||
|
||||
BIG5_SM_MODEL = {'class_table': BIG5_CLS,
|
||||
'class_factor': 5,
|
||||
'state_table': BIG5_ST,
|
||||
'char_len_table': BIG5_CHAR_LEN_TABLE,
|
||||
'name': 'Big5'}
|
||||
|
||||
# CP949
|
||||
|
||||
CP949_CLS = (
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
|
||||
1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
|
||||
4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
|
||||
1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
|
||||
5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
|
||||
0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
|
||||
6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
|
||||
6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
|
||||
7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
|
||||
7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
|
||||
)
|
||||
|
||||
CP949_ST = (
|
||||
#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
|
||||
MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START, 4, 5,MachineState.ERROR, 6, # MachineState.START
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, # MachineState.ERROR
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # MachineState.ITS_ME
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 3
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 4
|
||||
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5
|
||||
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6
|
||||
)
|
||||
|
||||
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
|
||||
|
||||
CP949_SM_MODEL = {'class_table': CP949_CLS,
|
||||
'class_factor': 10,
|
||||
'state_table': CP949_ST,
|
||||
'char_len_table': CP949_CHAR_LEN_TABLE,
|
||||
'name': 'CP949'}
|
||||
|
||||
# EUC-JP
|
||||
|
||||
EUCJP_CLS = (
|
||||
4,4,4,4,4,4,4,4, # 00 - 07
|
||||
4,4,4,4,4,4,5,5, # 08 - 0f
|
||||
4,4,4,4,4,4,4,4, # 10 - 17
|
||||
4,4,4,5,4,4,4,4, # 18 - 1f
|
||||
4,4,4,4,4,4,4,4, # 20 - 27
|
||||
4,4,4,4,4,4,4,4, # 28 - 2f
|
||||
4,4,4,4,4,4,4,4, # 30 - 37
|
||||
4,4,4,4,4,4,4,4, # 38 - 3f
|
||||
4,4,4,4,4,4,4,4, # 40 - 47
|
||||
4,4,4,4,4,4,4,4, # 48 - 4f
|
||||
4,4,4,4,4,4,4,4, # 50 - 57
|
||||
4,4,4,4,4,4,4,4, # 58 - 5f
|
||||
4,4,4,4,4,4,4,4, # 60 - 67
|
||||
4,4,4,4,4,4,4,4, # 68 - 6f
|
||||
4,4,4,4,4,4,4,4, # 70 - 77
|
||||
4,4,4,4,4,4,4,4, # 78 - 7f
|
||||
5,5,5,5,5,5,5,5, # 80 - 87
|
||||
5,5,5,5,5,5,1,3, # 88 - 8f
|
||||
5,5,5,5,5,5,5,5, # 90 - 97
|
||||
5,5,5,5,5,5,5,5, # 98 - 9f
|
||||
5,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,0,5 # f8 - ff
|
||||
)
|
||||
|
||||
EUCJP_ST = (
|
||||
3, 4, 3, 5,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f
|
||||
3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27
|
||||
)
|
||||
|
||||
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
|
||||
|
||||
EUCJP_SM_MODEL = {'class_table': EUCJP_CLS,
|
||||
'class_factor': 6,
|
||||
'state_table': EUCJP_ST,
|
||||
'char_len_table': EUCJP_CHAR_LEN_TABLE,
|
||||
'name': 'EUC-JP'}
|
||||
|
||||
# EUC-KR
|
||||
|
||||
EUCKR_CLS = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
1,1,1,1,1,1,1,1, # 40 - 47
|
||||
1,1,1,1,1,1,1,1, # 48 - 4f
|
||||
1,1,1,1,1,1,1,1, # 50 - 57
|
||||
1,1,1,1,1,1,1,1, # 58 - 5f
|
||||
1,1,1,1,1,1,1,1, # 60 - 67
|
||||
1,1,1,1,1,1,1,1, # 68 - 6f
|
||||
1,1,1,1,1,1,1,1, # 70 - 77
|
||||
1,1,1,1,1,1,1,1, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,3,3,3, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,3,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,0 # f8 - ff
|
||||
)
|
||||
|
||||
EUCKR_ST = (
|
||||
MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f
|
||||
)
|
||||
|
||||
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
|
||||
|
||||
EUCKR_SM_MODEL = {'class_table': EUCKR_CLS,
|
||||
'class_factor': 4,
|
||||
'state_table': EUCKR_ST,
|
||||
'char_len_table': EUCKR_CHAR_LEN_TABLE,
|
||||
'name': 'EUC-KR'}
|
||||
|
||||
# EUC-TW
|
||||
|
||||
EUCTW_CLS = (
|
||||
2,2,2,2,2,2,2,2, # 00 - 07
|
||||
2,2,2,2,2,2,0,0, # 08 - 0f
|
||||
2,2,2,2,2,2,2,2, # 10 - 17
|
||||
2,2,2,0,2,2,2,2, # 18 - 1f
|
||||
2,2,2,2,2,2,2,2, # 20 - 27
|
||||
2,2,2,2,2,2,2,2, # 28 - 2f
|
||||
2,2,2,2,2,2,2,2, # 30 - 37
|
||||
2,2,2,2,2,2,2,2, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,2, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,6,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,3,4,4,4,4,4,4, # a0 - a7
|
||||
5,5,1,1,1,1,1,1, # a8 - af
|
||||
1,1,1,1,1,1,1,1, # b0 - b7
|
||||
1,1,1,1,1,1,1,1, # b8 - bf
|
||||
1,1,3,1,3,3,3,3, # c0 - c7
|
||||
3,3,3,3,3,3,3,3, # c8 - cf
|
||||
3,3,3,3,3,3,3,3, # d0 - d7
|
||||
3,3,3,3,3,3,3,3, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,3,3,3, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,3,3,0 # f8 - ff
|
||||
)
|
||||
|
||||
EUCTW_ST = (
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.START, 3, 3, 3, 4,MachineState.ERROR,#00-07
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.ERROR,#10-17
|
||||
MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
|
||||
5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27
|
||||
MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
|
||||
)
|
||||
|
||||
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
|
||||
|
||||
EUCTW_SM_MODEL = {'class_table': EUCTW_CLS,
|
||||
'class_factor': 7,
|
||||
'state_table': EUCTW_ST,
|
||||
'char_len_table': EUCTW_CHAR_LEN_TABLE,
|
||||
'name': 'x-euc-tw'}
|
||||
|
||||
# GB2312
|
||||
|
||||
GB2312_CLS = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
3,3,3,3,3,3,3,3, # 30 - 37
|
||||
3,3,1,1,1,1,1,1, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,4, # 78 - 7f
|
||||
5,6,6,6,6,6,6,6, # 80 - 87
|
||||
6,6,6,6,6,6,6,6, # 88 - 8f
|
||||
6,6,6,6,6,6,6,6, # 90 - 97
|
||||
6,6,6,6,6,6,6,6, # 98 - 9f
|
||||
6,6,6,6,6,6,6,6, # a0 - a7
|
||||
6,6,6,6,6,6,6,6, # a8 - af
|
||||
6,6,6,6,6,6,6,6, # b0 - b7
|
||||
6,6,6,6,6,6,6,6, # b8 - bf
|
||||
6,6,6,6,6,6,6,6, # c0 - c7
|
||||
6,6,6,6,6,6,6,6, # c8 - cf
|
||||
6,6,6,6,6,6,6,6, # d0 - d7
|
||||
6,6,6,6,6,6,6,6, # d8 - df
|
||||
6,6,6,6,6,6,6,6, # e0 - e7
|
||||
6,6,6,6,6,6,6,6, # e8 - ef
|
||||
6,6,6,6,6,6,6,6, # f0 - f7
|
||||
6,6,6,6,6,6,6,0 # f8 - ff
|
||||
)
|
||||
|
||||
GB2312_ST = (
|
||||
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, 3,MachineState.ERROR,#00-07
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,#10-17
|
||||
4,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
|
||||
MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
|
||||
)
|
||||
|
||||
# To be accurate, the length of class 6 can be either 2 or 4.
|
||||
# But it is not necessary to discriminate between the two since
|
||||
# it is used for frequency analysis only, and we are validating
|
||||
# each code range there as well. So it is safe to set it to be
|
||||
# 2 here.
|
||||
GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)
|
||||
|
||||
GB2312_SM_MODEL = {'class_table': GB2312_CLS,
|
||||
'class_factor': 7,
|
||||
'state_table': GB2312_ST,
|
||||
'char_len_table': GB2312_CHAR_LEN_TABLE,
|
||||
'name': 'GB2312'}
|
||||
|
||||
# Shift_JIS
|
||||
|
||||
SJIS_CLS = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
||||
3,3,3,3,3,2,2,3, # 80 - 87
|
||||
3,3,3,3,3,3,3,3, # 88 - 8f
|
||||
3,3,3,3,3,3,3,3, # 90 - 97
|
||||
3,3,3,3,3,3,3,3, # 98 - 9f
|
||||
#0xa0 is illegal in sjis encoding, but some pages does
|
||||
#contain such byte. We need to be more error forgiven.
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,4,4,4, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,0,0,0) # f8 - ff
|
||||
|
||||
|
||||
SJIS_ST = (
|
||||
MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17
|
||||
)
|
||||
|
||||
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
|
||||
|
||||
SJIS_SM_MODEL = {'class_table': SJIS_CLS,
|
||||
'class_factor': 6,
|
||||
'state_table': SJIS_ST,
|
||||
'char_len_table': SJIS_CHAR_LEN_TABLE,
|
||||
'name': 'Shift_JIS'}
|
||||
|
||||
# UCS2-BE
|
||||
|
||||
UCS2BE_CLS = (
|
||||
0,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,3,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,3,3,3,3,3,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,0,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,0,0,0,0,0,0,0, # a0 - a7
|
||||
0,0,0,0,0,0,0,0, # a8 - af
|
||||
0,0,0,0,0,0,0,0, # b0 - b7
|
||||
0,0,0,0,0,0,0,0, # b8 - bf
|
||||
0,0,0,0,0,0,0,0, # c0 - c7
|
||||
0,0,0,0,0,0,0,0, # c8 - cf
|
||||
0,0,0,0,0,0,0,0, # d0 - d7
|
||||
0,0,0,0,0,0,0,0, # d8 - df
|
||||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,4,5 # f8 - ff
|
||||
)
|
||||
|
||||
UCS2BE_ST = (
|
||||
5, 7, 7,MachineState.ERROR, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||
MachineState.ITS_ME,MachineState.ITS_ME, 6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,#10-17
|
||||
6, 6, 6, 6, 6,MachineState.ITS_ME, 6, 6,#18-1f
|
||||
6, 6, 6, 6, 5, 7, 7,MachineState.ERROR,#20-27
|
||||
5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f
|
||||
6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37
|
||||
)
|
||||
|
||||
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
|
||||
|
||||
UCS2BE_SM_MODEL = {'class_table': UCS2BE_CLS,
|
||||
'class_factor': 6,
|
||||
'state_table': UCS2BE_ST,
|
||||
'char_len_table': UCS2BE_CHAR_LEN_TABLE,
|
||||
'name': 'UTF-16BE'}
|
||||
|
||||
# UCS2-LE
|
||||
|
||||
UCS2LE_CLS = (
|
||||
0,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,3,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,3,3,3,3,3,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,0,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,0,0,0,0,0,0,0, # a0 - a7
|
||||
0,0,0,0,0,0,0,0, # a8 - af
|
||||
0,0,0,0,0,0,0,0, # b0 - b7
|
||||
0,0,0,0,0,0,0,0, # b8 - bf
|
||||
0,0,0,0,0,0,0,0, # c0 - c7
|
||||
0,0,0,0,0,0,0,0, # c8 - cf
|
||||
0,0,0,0,0,0,0,0, # d0 - d7
|
||||
0,0,0,0,0,0,0,0, # d8 - df
|
||||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,4,5 # f8 - ff
|
||||
)
|
||||
|
||||
UCS2LE_ST = (
|
||||
6, 6, 7, 6, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||
MachineState.ITS_ME,MachineState.ITS_ME, 5, 5, 5,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#10-17
|
||||
5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR, 6, 6,#18-1f
|
||||
7, 6, 8, 8, 5, 5, 5,MachineState.ERROR,#20-27
|
||||
5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f
|
||||
5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37
|
||||
)
|
||||
|
||||
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
|
||||
|
||||
UCS2LE_SM_MODEL = {'class_table': UCS2LE_CLS,
|
||||
'class_factor': 6,
|
||||
'state_table': UCS2LE_ST,
|
||||
'char_len_table': UCS2LE_CHAR_LEN_TABLE,
|
||||
'name': 'UTF-16LE'}
|
||||
|
||||
# UTF-8
|
||||
|
||||
UTF8_CLS = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
1,1,1,1,1,1,1,1, # 40 - 47
|
||||
1,1,1,1,1,1,1,1, # 48 - 4f
|
||||
1,1,1,1,1,1,1,1, # 50 - 57
|
||||
1,1,1,1,1,1,1,1, # 58 - 5f
|
||||
1,1,1,1,1,1,1,1, # 60 - 67
|
||||
1,1,1,1,1,1,1,1, # 68 - 6f
|
||||
1,1,1,1,1,1,1,1, # 70 - 77
|
||||
1,1,1,1,1,1,1,1, # 78 - 7f
|
||||
2,2,2,2,3,3,3,3, # 80 - 87
|
||||
4,4,4,4,4,4,4,4, # 88 - 8f
|
||||
4,4,4,4,4,4,4,4, # 90 - 97
|
||||
4,4,4,4,4,4,4,4, # 98 - 9f
|
||||
5,5,5,5,5,5,5,5, # a0 - a7
|
||||
5,5,5,5,5,5,5,5, # a8 - af
|
||||
5,5,5,5,5,5,5,5, # b0 - b7
|
||||
5,5,5,5,5,5,5,5, # b8 - bf
|
||||
0,0,6,6,6,6,6,6, # c0 - c7
|
||||
6,6,6,6,6,6,6,6, # c8 - cf
|
||||
6,6,6,6,6,6,6,6, # d0 - d7
|
||||
6,6,6,6,6,6,6,6, # d8 - df
|
||||
7,8,8,8,8,8,8,8, # e0 - e7
|
||||
8,8,8,8,8,9,8,8, # e8 - ef
|
||||
10,11,11,11,11,11,11,11, # f0 - f7
|
||||
12,13,13,13,14,15,0,0 # f8 - ff
|
||||
)
|
||||
|
||||
UTF8_ST = (
|
||||
MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 12, 10,#00-07
|
||||
9, 11, 8, 7, 6, 5, 4, 3,#08-0f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#18-1f
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#20-27
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#28-2f
|
||||
MachineState.ERROR,MachineState.ERROR, 5, 5, 5, 5,MachineState.ERROR,MachineState.ERROR,#30-37
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#38-3f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5, 5,MachineState.ERROR,MachineState.ERROR,#40-47
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#48-4f
|
||||
MachineState.ERROR,MachineState.ERROR, 7, 7, 7, 7,MachineState.ERROR,MachineState.ERROR,#50-57
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#58-5f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 7, 7,MachineState.ERROR,MachineState.ERROR,#60-67
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#68-6f
|
||||
MachineState.ERROR,MachineState.ERROR, 9, 9, 9, 9,MachineState.ERROR,MachineState.ERROR,#70-77
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#78-7f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 9,MachineState.ERROR,MachineState.ERROR,#80-87
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#88-8f
|
||||
MachineState.ERROR,MachineState.ERROR, 12, 12, 12, 12,MachineState.ERROR,MachineState.ERROR,#90-97
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#98-9f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 12,MachineState.ERROR,MachineState.ERROR,#a0-a7
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#a8-af
|
||||
MachineState.ERROR,MachineState.ERROR, 12, 12, 12,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b0-b7
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#b8-bf
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf
|
||||
)
|
||||
|
||||
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
|
||||
|
||||
UTF8_SM_MODEL = {'class_table': UTF8_CLS,
|
||||
'class_factor': 16,
|
||||
'state_table': UTF8_ST,
|
||||
'char_len_table': UTF8_CHAR_LEN_TABLE,
|
||||
'name': 'UTF-8'}
|
|
@ -1,310 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Metadata about languages used by our model training code for our
|
||||
SingleByteCharSetProbers. Could be used for other things in the future.
|
||||
|
||||
This code is based on the language metadata from the uchardet project.
|
||||
"""
|
||||
from __future__ import absolute_import, print_function
|
||||
|
||||
from string import ascii_letters
|
||||
|
||||
|
||||
# TODO: Add Ukranian (KOI8-U)
|
||||
|
||||
class Language(object):
|
||||
"""Metadata about a language useful for training models
|
||||
|
||||
:ivar name: The human name for the language, in English.
|
||||
:type name: str
|
||||
:ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
||||
or use another catalog as a last resort.
|
||||
:type iso_code: str
|
||||
:ivar use_ascii: Whether or not ASCII letters should be included in trained
|
||||
models.
|
||||
:type use_ascii: bool
|
||||
:ivar charsets: The charsets we want to support and create data for.
|
||||
:type charsets: list of str
|
||||
:ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
|
||||
`True`, you only need to add those not in the ASCII set.
|
||||
:type alphabet: str
|
||||
:ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
|
||||
Wikipedia for training data.
|
||||
:type wiki_start_pages: list of str
|
||||
"""
|
||||
def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
|
||||
alphabet=None, wiki_start_pages=None):
|
||||
super(Language, self).__init__()
|
||||
self.name = name
|
||||
self.iso_code = iso_code
|
||||
self.use_ascii = use_ascii
|
||||
self.charsets = charsets
|
||||
if self.use_ascii:
|
||||
if alphabet:
|
||||
alphabet += ascii_letters
|
||||
else:
|
||||
alphabet = ascii_letters
|
||||
elif not alphabet:
|
||||
raise ValueError('Must supply alphabet if use_ascii is False')
|
||||
self.alphabet = ''.join(sorted(set(alphabet))) if alphabet else None
|
||||
self.wiki_start_pages = wiki_start_pages
|
||||
|
||||
def __repr__(self):
|
||||
return '{}({})'.format(self.__class__.__name__,
|
||||
', '.join('{}={!r}'.format(k, v)
|
||||
for k, v in self.__dict__.items()
|
||||
if not k.startswith('_')))
|
||||
|
||||
|
||||
LANGUAGES = {'Arabic': Language(name='Arabic',
|
||||
iso_code='ar',
|
||||
use_ascii=False,
|
||||
# We only support encodings that use isolated
|
||||
# forms, because the current recommendation is
|
||||
# that the rendering system handles presentation
|
||||
# forms. This means we purposefully skip IBM864.
|
||||
charsets=['ISO-8859-6', 'WINDOWS-1256',
|
||||
'CP720', 'CP864'],
|
||||
alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
|
||||
wiki_start_pages=[u'الصفحة_الرئيسية']),
|
||||
'Belarusian': Language(name='Belarusian',
|
||||
iso_code='be',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'IBM866', 'MacCyrillic'],
|
||||
alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
|
||||
u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
|
||||
wiki_start_pages=[u'Галоўная_старонка']),
|
||||
'Bulgarian': Language(name='Bulgarian',
|
||||
iso_code='bg',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'IBM855'],
|
||||
alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
|
||||
u'абвгдежзийклмнопрстуфхцчшщъьюя'),
|
||||
wiki_start_pages=[u'Начална_страница']),
|
||||
'Czech': Language(name='Czech',
|
||||
iso_code='cz',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
|
||||
wiki_start_pages=[u'Hlavní_strana']),
|
||||
'Danish': Language(name='Danish',
|
||||
iso_code='da',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'æøåÆØÅ',
|
||||
wiki_start_pages=[u'Forside']),
|
||||
'German': Language(name='German',
|
||||
iso_code='de',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
||||
alphabet=u'äöüßÄÖÜ',
|
||||
wiki_start_pages=[u'Wikipedia:Hauptseite']),
|
||||
'Greek': Language(name='Greek',
|
||||
iso_code='el',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-7', 'WINDOWS-1253'],
|
||||
alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
|
||||
u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
|
||||
wiki_start_pages=[u'Πύλη:Κύρια']),
|
||||
'English': Language(name='English',
|
||||
iso_code='en',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
||||
wiki_start_pages=[u'Main_Page']),
|
||||
'Esperanto': Language(name='Esperanto',
|
||||
iso_code='eo',
|
||||
# Q, W, X, and Y not used at all
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-3'],
|
||||
alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
|
||||
u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
|
||||
wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
|
||||
'Spanish': Language(name='Spanish',
|
||||
iso_code='es',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
|
||||
wiki_start_pages=[u'Wikipedia:Portada']),
|
||||
'Estonian': Language(name='Estonian',
|
||||
iso_code='et',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-4', 'ISO-8859-13',
|
||||
'WINDOWS-1257'],
|
||||
# C, F, Š, Q, W, X, Y, Z, Ž are only for
|
||||
# loanwords
|
||||
alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
|
||||
u'abdeghijklmnoprstuvõäöü'),
|
||||
wiki_start_pages=[u'Esileht']),
|
||||
'Finnish': Language(name='Finnish',
|
||||
iso_code='fi',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'ÅÄÖŠŽåäöšž',
|
||||
wiki_start_pages=[u'Wikipedia:Etusivu']),
|
||||
'French': Language(name='French',
|
||||
iso_code='fr',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
|
||||
wiki_start_pages=[u'Wikipédia:Accueil_principal',
|
||||
u'Bœuf (animal)']),
|
||||
'Hebrew': Language(name='Hebrew',
|
||||
iso_code='he',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-8', 'WINDOWS-1255'],
|
||||
alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
|
||||
wiki_start_pages=[u'עמוד_ראשי']),
|
||||
'Croatian': Language(name='Croatian',
|
||||
iso_code='hr',
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
|
||||
u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
|
||||
wiki_start_pages=[u'Glavna_stranica']),
|
||||
'Hungarian': Language(name='Hungarian',
|
||||
iso_code='hu',
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
|
||||
u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
|
||||
wiki_start_pages=[u'Kezdőlap']),
|
||||
'Italian': Language(name='Italian',
|
||||
iso_code='it',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
|
||||
wiki_start_pages=[u'Pagina_principale']),
|
||||
'Lithuanian': Language(name='Lithuanian',
|
||||
iso_code='lt',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
||||
'ISO-8859-4'],
|
||||
# Q, W, and X not used at all
|
||||
alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
|
||||
u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
|
||||
wiki_start_pages=[u'Pagrindinis_puslapis']),
|
||||
'Latvian': Language(name='Latvian',
|
||||
iso_code='lv',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
||||
'ISO-8859-4'],
|
||||
# Q, W, X, Y are only for loanwords
|
||||
alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
|
||||
u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
|
||||
wiki_start_pages=[u'Sākumlapa']),
|
||||
'Macedonian': Language(name='Macedonian',
|
||||
iso_code='mk',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'MacCyrillic', 'IBM855'],
|
||||
alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
|
||||
u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
|
||||
wiki_start_pages=[u'Главна_страница']),
|
||||
'Dutch': Language(name='Dutch',
|
||||
iso_code='nl',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
||||
wiki_start_pages=[u'Hoofdpagina']),
|
||||
'Polish': Language(name='Polish',
|
||||
iso_code='pl',
|
||||
# Q and X are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
|
||||
u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
|
||||
wiki_start_pages=[u'Wikipedia:Strona_główna']),
|
||||
'Portuguese': Language(name='Portuguese',
|
||||
iso_code='pt',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
|
||||
wiki_start_pages=[u'Wikipédia:Página_principal']),
|
||||
'Romanian': Language(name='Romanian',
|
||||
iso_code='ro',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=u'ăâîșțĂÂÎȘȚ',
|
||||
wiki_start_pages=[u'Pagina_principală']),
|
||||
'Russian': Language(name='Russian',
|
||||
iso_code='ru',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'KOI8-R', 'MacCyrillic', 'IBM866',
|
||||
'IBM855'],
|
||||
alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
|
||||
u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
|
||||
wiki_start_pages=[u'Заглавная_страница']),
|
||||
'Slovak': Language(name='Slovak',
|
||||
iso_code='sk',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
|
||||
wiki_start_pages=[u'Hlavná_stránka']),
|
||||
'Slovene': Language(name='Slovene',
|
||||
iso_code='sl',
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=(u'abcčdefghijklmnoprsštuvzž'
|
||||
u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
|
||||
wiki_start_pages=[u'Glavna_stran']),
|
||||
# Serbian can be written in both Latin and Cyrillic, but there's no
|
||||
# simple way to get the Latin alphabet pages from Wikipedia through
|
||||
# the API, so for now we just support Cyrillic.
|
||||
'Serbian': Language(name='Serbian',
|
||||
iso_code='sr',
|
||||
alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
|
||||
u'абвгдђежзијклљмнњопрстћуфхцчџш'),
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'MacCyrillic', 'IBM855'],
|
||||
wiki_start_pages=[u'Главна_страна']),
|
||||
'Thai': Language(name='Thai',
|
||||
iso_code='th',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
|
||||
alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
|
||||
wiki_start_pages=[u'หน้าหลัก']),
|
||||
'Turkish': Language(name='Turkish',
|
||||
iso_code='tr',
|
||||
# Q, W, and X are not used by Turkish
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-3', 'ISO-8859-9',
|
||||
'WINDOWS-1254'],
|
||||
alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
|
||||
u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
|
||||
wiki_start_pages=[u'Ana_Sayfa']),
|
||||
'Vietnamese': Language(name='Vietnamese',
|
||||
iso_code='vi',
|
||||
use_ascii=False,
|
||||
# Windows-1258 is the only common 8-bit
|
||||
# Vietnamese encoding supported by Python.
|
||||
# From Wikipedia:
|
||||
# For systems that lack support for Unicode,
|
||||
# dozens of 8-bit Vietnamese code pages are
|
||||
# available.[1] The most common are VISCII
|
||||
# (TCVN 5712:1993), VPS, and Windows-1258.[3]
|
||||
# Where ASCII is required, such as when
|
||||
# ensuring readability in plain text e-mail,
|
||||
# Vietnamese letters are often encoded
|
||||
# according to Vietnamese Quoted-Readable
|
||||
# (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
|
||||
# though usage of either variable-width
|
||||
# scheme has declined dramatically following
|
||||
# the adoption of Unicode on the World Wide
|
||||
# Web.
|
||||
charsets=['WINDOWS-1258'],
|
||||
alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
|
||||
u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
|
||||
wiki_start_pages=[u'Chữ_Quốc_ngữ']),
|
||||
}
|
|
@ -1,145 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
# Shy Shalom - original C code
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
|
||||
|
||||
|
||||
SingleByteCharSetModel = namedtuple('SingleByteCharSetModel',
|
||||
['charset_name',
|
||||
'language',
|
||||
'char_to_order_map',
|
||||
'language_model',
|
||||
'typical_positive_ratio',
|
||||
'keep_ascii_letters',
|
||||
'alphabet'])
|
||||
|
||||
|
||||
class SingleByteCharSetProber(CharSetProber):
|
||||
SAMPLE_SIZE = 64
|
||||
SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
|
||||
POSITIVE_SHORTCUT_THRESHOLD = 0.95
|
||||
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
|
||||
|
||||
def __init__(self, model, reversed=False, name_prober=None):
|
||||
super(SingleByteCharSetProber, self).__init__()
|
||||
self._model = model
|
||||
# TRUE if we need to reverse every pair in the model lookup
|
||||
self._reversed = reversed
|
||||
# Optional auxiliary prober for name decision
|
||||
self._name_prober = name_prober
|
||||
self._last_order = None
|
||||
self._seq_counters = None
|
||||
self._total_seqs = None
|
||||
self._total_char = None
|
||||
self._freq_char = None
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
super(SingleByteCharSetProber, self).reset()
|
||||
# char order of last character
|
||||
self._last_order = 255
|
||||
self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
|
||||
self._total_seqs = 0
|
||||
self._total_char = 0
|
||||
# characters that fall in our sampling range
|
||||
self._freq_char = 0
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
if self._name_prober:
|
||||
return self._name_prober.charset_name
|
||||
else:
|
||||
return self._model.charset_name
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
if self._name_prober:
|
||||
return self._name_prober.language
|
||||
else:
|
||||
return self._model.language
|
||||
|
||||
def feed(self, byte_str):
|
||||
# TODO: Make filter_international_words keep things in self.alphabet
|
||||
if not self._model.keep_ascii_letters:
|
||||
byte_str = self.filter_international_words(byte_str)
|
||||
if not byte_str:
|
||||
return self.state
|
||||
char_to_order_map = self._model.char_to_order_map
|
||||
language_model = self._model.language_model
|
||||
for char in byte_str:
|
||||
order = char_to_order_map.get(char, CharacterCategory.UNDEFINED)
|
||||
# XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
|
||||
# CharacterCategory.SYMBOL is actually 253, so we use CONTROL
|
||||
# to make it closer to the original intent. The only difference
|
||||
# is whether or not we count digits and control characters for
|
||||
# _total_char purposes.
|
||||
if order < CharacterCategory.CONTROL:
|
||||
self._total_char += 1
|
||||
# TODO: Follow uchardet's lead and discount confidence for frequent
|
||||
# control characters.
|
||||
# See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
|
||||
if order < self.SAMPLE_SIZE:
|
||||
self._freq_char += 1
|
||||
if self._last_order < self.SAMPLE_SIZE:
|
||||
self._total_seqs += 1
|
||||
if not self._reversed:
|
||||
lm_cat = language_model[self._last_order][order]
|
||||
else:
|
||||
lm_cat = language_model[order][self._last_order]
|
||||
self._seq_counters[lm_cat] += 1
|
||||
self._last_order = order
|
||||
|
||||
charset_name = self._model.charset_name
|
||||
if self.state == ProbingState.DETECTING:
|
||||
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
|
||||
confidence = self.get_confidence()
|
||||
if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
|
||||
self.logger.debug('%s confidence = %s, we have a winner',
|
||||
charset_name, confidence)
|
||||
self._state = ProbingState.FOUND_IT
|
||||
elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
|
||||
self.logger.debug('%s confidence = %s, below negative '
|
||||
'shortcut threshhold %s', charset_name,
|
||||
confidence,
|
||||
self.NEGATIVE_SHORTCUT_THRESHOLD)
|
||||
self._state = ProbingState.NOT_ME
|
||||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
r = 0.01
|
||||
if self._total_seqs > 0:
|
||||
r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
|
||||
self._total_seqs / self._model.typical_positive_ratio)
|
||||
r = r * self._freq_char / self._total_char
|
||||
if r >= 1.0:
|
||||
r = 0.99
|
||||
return r
|
|
@ -1,83 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
# Shy Shalom - original C code
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .hebrewprober import HebrewProber
|
||||
from .langbulgarianmodel import (ISO_8859_5_BULGARIAN_MODEL,
|
||||
WINDOWS_1251_BULGARIAN_MODEL)
|
||||
from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL
|
||||
from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
|
||||
# from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL,
|
||||
# WINDOWS_1250_HUNGARIAN_MODEL)
|
||||
from .langrussianmodel import (IBM855_RUSSIAN_MODEL, IBM866_RUSSIAN_MODEL,
|
||||
ISO_8859_5_RUSSIAN_MODEL, KOI8_R_RUSSIAN_MODEL,
|
||||
MACCYRILLIC_RUSSIAN_MODEL,
|
||||
WINDOWS_1251_RUSSIAN_MODEL)
|
||||
from .langthaimodel import TIS_620_THAI_MODEL
|
||||
from .langturkishmodel import ISO_8859_9_TURKISH_MODEL
|
||||
from .sbcharsetprober import SingleByteCharSetProber
|
||||
|
||||
|
||||
class SBCSGroupProber(CharSetGroupProber):
|
||||
def __init__(self):
|
||||
super(SBCSGroupProber, self).__init__()
|
||||
hebrew_prober = HebrewProber()
|
||||
logical_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
|
||||
False, hebrew_prober)
|
||||
# TODO: See if using ISO-8859-8 Hebrew model works better here, since
|
||||
# it's actually the visual one
|
||||
visual_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
|
||||
True, hebrew_prober)
|
||||
hebrew_prober.set_model_probers(logical_hebrew_prober,
|
||||
visual_hebrew_prober)
|
||||
# TODO: ORDER MATTERS HERE. I changed the order vs what was in master
|
||||
# and several tests failed that did not before. Some thought
|
||||
# should be put into the ordering, and we should consider making
|
||||
# order not matter here, because that is very counter-intuitive.
|
||||
self.probers = [
|
||||
SingleByteCharSetProber(WINDOWS_1251_RUSSIAN_MODEL),
|
||||
SingleByteCharSetProber(KOI8_R_RUSSIAN_MODEL),
|
||||
SingleByteCharSetProber(ISO_8859_5_RUSSIAN_MODEL),
|
||||
SingleByteCharSetProber(MACCYRILLIC_RUSSIAN_MODEL),
|
||||
SingleByteCharSetProber(IBM866_RUSSIAN_MODEL),
|
||||
SingleByteCharSetProber(IBM855_RUSSIAN_MODEL),
|
||||
SingleByteCharSetProber(ISO_8859_7_GREEK_MODEL),
|
||||
SingleByteCharSetProber(WINDOWS_1253_GREEK_MODEL),
|
||||
SingleByteCharSetProber(ISO_8859_5_BULGARIAN_MODEL),
|
||||
SingleByteCharSetProber(WINDOWS_1251_BULGARIAN_MODEL),
|
||||
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250)
|
||||
# after we retrain model.
|
||||
# SingleByteCharSetProber(ISO_8859_2_HUNGARIAN_MODEL),
|
||||
# SingleByteCharSetProber(WINDOWS_1250_HUNGARIAN_MODEL),
|
||||
SingleByteCharSetProber(TIS_620_THAI_MODEL),
|
||||
SingleByteCharSetProber(ISO_8859_9_TURKISH_MODEL),
|
||||
hebrew_prober,
|
||||
logical_hebrew_prober,
|
||||
visual_hebrew_prober,
|
||||
]
|
||||
self.reset()
|
|
@ -1,92 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is mozilla.org code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 1998
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import SJISDistributionAnalysis
|
||||
from .jpcntx import SJISContextAnalysis
|
||||
from .mbcssm import SJIS_SM_MODEL
|
||||
from .enums import ProbingState, MachineState
|
||||
|
||||
|
||||
class SJISProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
super(SJISProber, self).__init__()
|
||||
self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
|
||||
self.distribution_analyzer = SJISDistributionAnalysis()
|
||||
self.context_analyzer = SJISContextAnalysis()
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
super(SJISProber, self).reset()
|
||||
self.context_analyzer.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
return self.context_analyzer.charset_name
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
return "Japanese"
|
||||
|
||||
def feed(self, byte_str):
|
||||
for i in range(len(byte_str)):
|
||||
coding_state = self.coding_sm.next_state(byte_str[i])
|
||||
if coding_state == MachineState.ERROR:
|
||||
self.logger.debug('%s %s prober hit error at byte %s',
|
||||
self.charset_name, self.language, i)
|
||||
self._state = ProbingState.NOT_ME
|
||||
break
|
||||
elif coding_state == MachineState.ITS_ME:
|
||||
self._state = ProbingState.FOUND_IT
|
||||
break
|
||||
elif coding_state == MachineState.START:
|
||||
char_len = self.coding_sm.get_current_charlen()
|
||||
if i == 0:
|
||||
self._last_char[1] = byte_str[0]
|
||||
self.context_analyzer.feed(self._last_char[2 - char_len:],
|
||||
char_len)
|
||||
self.distribution_analyzer.feed(self._last_char, char_len)
|
||||
else:
|
||||
self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3
|
||||
- char_len], char_len)
|
||||
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
||||
char_len)
|
||||
|
||||
self._last_char[0] = byte_str[-1]
|
||||
|
||||
if self.state == ProbingState.DETECTING:
|
||||
if (self.context_analyzer.got_enough_data() and
|
||||
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
||||
self._state = ProbingState.FOUND_IT
|
||||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
context_conf = self.context_analyzer.get_confidence()
|
||||
distrib_conf = self.distribution_analyzer.get_confidence()
|
||||
return max(context_conf, distrib_conf)
|
|
@ -1,286 +0,0 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Mark Pilgrim - port to Python
|
||||
# Shy Shalom - original C code
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
"""
|
||||
Module containing the UniversalDetector detector class, which is the primary
|
||||
class a user of ``chardet`` should use.
|
||||
|
||||
:author: Mark Pilgrim (initial port to Python)
|
||||
:author: Shy Shalom (original C code)
|
||||
:author: Dan Blanchard (major refactoring for 3.0)
|
||||
:author: Ian Cordasco
|
||||
"""
|
||||
|
||||
|
||||
import codecs
|
||||
import logging
|
||||
import re
|
||||
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .enums import InputState, LanguageFilter, ProbingState
|
||||
from .escprober import EscCharSetProber
|
||||
from .latin1prober import Latin1Prober
|
||||
from .mbcsgroupprober import MBCSGroupProber
|
||||
from .sbcsgroupprober import SBCSGroupProber
|
||||
|
||||
|
||||
class UniversalDetector(object):
|
||||
"""
|
||||
The ``UniversalDetector`` class underlies the ``chardet.detect`` function
|
||||
and coordinates all of the different charset probers.
|
||||
|
||||
To get a ``dict`` containing an encoding and its confidence, you can simply
|
||||
run:
|
||||
|
||||
.. code::
|
||||
|
||||
u = UniversalDetector()
|
||||
u.feed(some_bytes)
|
||||
u.close()
|
||||
detected = u.result
|
||||
|
||||
"""
|
||||
|
||||
MINIMUM_THRESHOLD = 0.20
|
||||
HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
|
||||
ESC_DETECTOR = re.compile(b'(\033|~{)')
|
||||
WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
|
||||
ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
|
||||
'iso-8859-2': 'Windows-1250',
|
||||
'iso-8859-5': 'Windows-1251',
|
||||
'iso-8859-6': 'Windows-1256',
|
||||
'iso-8859-7': 'Windows-1253',
|
||||
'iso-8859-8': 'Windows-1255',
|
||||
'iso-8859-9': 'Windows-1254',
|
||||
'iso-8859-13': 'Windows-1257'}
|
||||
|
||||
def __init__(self, lang_filter=LanguageFilter.ALL):
|
||||
self._esc_charset_prober = None
|
||||
self._charset_probers = []
|
||||
self.result = None
|
||||
self.done = None
|
||||
self._got_data = None
|
||||
self._input_state = None
|
||||
self._last_char = None
|
||||
self.lang_filter = lang_filter
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self._has_win_bytes = None
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
"""
|
||||
Reset the UniversalDetector and all of its probers back to their
|
||||
initial states. This is called by ``__init__``, so you only need to
|
||||
call this directly in between analyses of different documents.
|
||||
"""
|
||||
self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
|
||||
self.done = False
|
||||
self._got_data = False
|
||||
self._has_win_bytes = False
|
||||
self._input_state = InputState.PURE_ASCII
|
||||
self._last_char = b''
|
||||
if self._esc_charset_prober:
|
||||
self._esc_charset_prober.reset()
|
||||
for prober in self._charset_probers:
|
||||
prober.reset()
|
||||
|
||||
def feed(self, byte_str):
|
||||
"""
|
||||
Takes a chunk of a document and feeds it through all of the relevant
|
||||
charset probers.
|
||||
|
||||
After calling ``feed``, you can check the value of the ``done``
|
||||
attribute to see if you need to continue feeding the
|
||||
``UniversalDetector`` more data, or if it has made a prediction
|
||||
(in the ``result`` attribute).
|
||||
|
||||
.. note::
|
||||
You should always call ``close`` when you're done feeding in your
|
||||
document if ``done`` is not already ``True``.
|
||||
"""
|
||||
if self.done:
|
||||
return
|
||||
|
||||
if not len(byte_str):
|
||||
return
|
||||
|
||||
if not isinstance(byte_str, bytearray):
|
||||
byte_str = bytearray(byte_str)
|
||||
|
||||
# First check for known BOMs, since these are guaranteed to be correct
|
||||
if not self._got_data:
|
||||
# If the data starts with BOM, we know it is UTF
|
||||
if byte_str.startswith(codecs.BOM_UTF8):
|
||||
# EF BB BF UTF-8 with BOM
|
||||
self.result = {'encoding': "UTF-8-SIG",
|
||||
'confidence': 1.0,
|
||||
'language': ''}
|
||||
elif byte_str.startswith((codecs.BOM_UTF32_LE,
|
||||
codecs.BOM_UTF32_BE)):
|
||||
# FF FE 00 00 UTF-32, little-endian BOM
|
||||
# 00 00 FE FF UTF-32, big-endian BOM
|
||||
self.result = {'encoding': "UTF-32",
|
||||
'confidence': 1.0,
|
||||
'language': ''}
|
||||
elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
|
||||
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||
self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
|
||||
'confidence': 1.0,
|
||||
'language': ''}
|
||||
elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
|
||||
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||
self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
|
||||
'confidence': 1.0,
|
||||
'language': ''}
|
||||
elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
|
||||
# FF FE UTF-16, little endian BOM
|
||||
# FE FF UTF-16, big endian BOM
|
||||
self.result = {'encoding': "UTF-16",
|
||||
'confidence': 1.0,
|
||||
'language': ''}
|
||||
|
||||
self._got_data = True
|
||||
if self.result['encoding'] is not None:
|
||||
self.done = True
|
||||
return
|
||||
|
||||
# If none of those matched and we've only see ASCII so far, check
|
||||
# for high bytes and escape sequences
|
||||
if self._input_state == InputState.PURE_ASCII:
|
||||
if self.HIGH_BYTE_DETECTOR.search(byte_str):
|
||||
self._input_state = InputState.HIGH_BYTE
|
||||
elif self._input_state == InputState.PURE_ASCII and \
|
||||
self.ESC_DETECTOR.search(self._last_char + byte_str):
|
||||
self._input_state = InputState.ESC_ASCII
|
||||
|
||||
self._last_char = byte_str[-1:]
|
||||
|
||||
# If we've seen escape sequences, use the EscCharSetProber, which
|
||||
# uses a simple state machine to check for known escape sequences in
|
||||
# HZ and ISO-2022 encodings, since those are the only encodings that
|
||||
# use such sequences.
|
||||
if self._input_state == InputState.ESC_ASCII:
|
||||
if not self._esc_charset_prober:
|
||||
self._esc_charset_prober = EscCharSetProber(self.lang_filter)
|
||||
if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
|
||||
self.result = {'encoding':
|
||||
self._esc_charset_prober.charset_name,
|
||||
'confidence':
|
||||
self._esc_charset_prober.get_confidence(),
|
||||
'language':
|
||||
self._esc_charset_prober.language}
|
||||
self.done = True
|
||||
# If we've seen high bytes (i.e., those with values greater than 127),
|
||||
# we need to do more complicated checks using all our multi-byte and
|
||||
# single-byte probers that are left. The single-byte probers
|
||||
# use character bigram distributions to determine the encoding, whereas
|
||||
# the multi-byte probers use a combination of character unigram and
|
||||
# bigram distributions.
|
||||
elif self._input_state == InputState.HIGH_BYTE:
|
||||
if not self._charset_probers:
|
||||
self._charset_probers = [MBCSGroupProber(self.lang_filter)]
|
||||
# If we're checking non-CJK encodings, use single-byte prober
|
||||
if self.lang_filter & LanguageFilter.NON_CJK:
|
||||
self._charset_probers.append(SBCSGroupProber())
|
||||
self._charset_probers.append(Latin1Prober())
|
||||
for prober in self._charset_probers:
|
||||
if prober.feed(byte_str) == ProbingState.FOUND_IT:
|
||||
self.result = {'encoding': prober.charset_name,
|
||||
'confidence': prober.get_confidence(),
|
||||
'language': prober.language}
|
||||
self.done = True
|
||||
break
|
||||
if self.WIN_BYTE_DETECTOR.search(byte_str):
|
||||
self._has_win_bytes = True
|
||||
|
||||
def close(self):
|
||||
"""
|
||||
Stop analyzing the current document and come up with a final
|
||||
prediction.
|
||||
|
||||
:returns: The ``result`` attribute, a ``dict`` with the keys
|
||||
`encoding`, `confidence`, and `language`.
|
||||
"""
|
||||
# Don't bother with checks if we're already done
|
||||
if self.done:
|
||||
return self.result
|
||||
self.done = True
|
||||
|
||||
if not self._got_data:
|
||||
self.logger.debug('no data received!')
|
||||
|
||||
# Default to ASCII if it is all we've seen so far
|
||||
elif self._input_state == InputState.PURE_ASCII:
|
||||
self.result = {'encoding': 'ascii',
|
||||
'confidence': 1.0,
|
||||
'language': ''}
|
||||
|
||||
# If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
|
||||
elif self._input_state == InputState.HIGH_BYTE:
|
||||
prober_confidence = None
|
||||
max_prober_confidence = 0.0
|
||||
max_prober = None
|
||||
for prober in self._charset_probers:
|
||||
if not prober:
|
||||
continue
|
||||
prober_confidence = prober.get_confidence()
|
||||
if prober_confidence > max_prober_confidence:
|
||||
max_prober_confidence = prober_confidence
|
||||
max_prober = prober
|
||||
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
|
||||
charset_name = max_prober.charset_name
|
||||
lower_charset_name = max_prober.charset_name.lower()
|
||||
confidence = max_prober.get_confidence()
|
||||
# Use Windows encoding name instead of ISO-8859 if we saw any
|
||||
# extra Windows-specific bytes
|
||||
if lower_charset_name.startswith('iso-8859'):
|
||||
if self._has_win_bytes:
|
||||
charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
|
||||
charset_name)
|
||||
self.result = {'encoding': charset_name,
|
||||
'confidence': confidence,
|
||||
'language': max_prober.language}
|
||||
|
||||
# Log all prober confidences if none met MINIMUM_THRESHOLD
|
||||
if self.logger.getEffectiveLevel() <= logging.DEBUG:
|
||||
if self.result['encoding'] is None:
|
||||
self.logger.debug('no probers hit minimum threshold')
|
||||
for group_prober in self._charset_probers:
|
||||
if not group_prober:
|
||||
continue
|
||||
if isinstance(group_prober, CharSetGroupProber):
|
||||
for prober in group_prober.probers:
|
||||
self.logger.debug('%s %s confidence = %s',
|
||||
prober.charset_name,
|
||||
prober.language,
|
||||
prober.get_confidence())
|
||||
else:
|
||||
self.logger.debug('%s %s confidence = %s',
|
||||
group_prober.charset_name,
|
||||
group_prober.language,
|
||||
group_prober.get_confidence())
|
||||
return self.result
|
|
@ -1,9 +0,0 @@
|
|||
"""
|
||||
This module exists only to simplify retrieving the version number of chardet
|
||||
from within setup.py and from chardet subpackages.
|
||||
|
||||
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||
"""
|
||||
|
||||
__version__ = "4.0.0"
|
||||
VERSION = __version__.split('.')
|
|
@ -1,12 +0,0 @@
|
|||
[bdist_wheel]
|
||||
universal = 1
|
||||
|
||||
[tool:pytest]
|
||||
addopts = -v
|
||||
python_files = test.py
|
||||
norecursedirs = *
|
||||
|
||||
[egg_info]
|
||||
tag_build =
|
||||
tag_date = 0
|
||||
|
|
@ -1,53 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
|
||||
# Get version without importing, which avoids dependency issues
|
||||
def get_version():
|
||||
import re
|
||||
with open('chardet/version.py') as version_file:
|
||||
return re.search(r"""__version__\s+=\s+(['"])(?P<version>.+?)\1""",
|
||||
version_file.read()).group('version')
|
||||
|
||||
|
||||
def readme():
|
||||
with open('README.rst') as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
setup(name='chardet',
|
||||
version=get_version(),
|
||||
description='Universal encoding detector for Python 2 and 3',
|
||||
long_description=readme(),
|
||||
author='Mark Pilgrim',
|
||||
author_email='mark@diveintomark.org',
|
||||
maintainer='Daniel Blanchard',
|
||||
maintainer_email='dan.blanchard@gmail.com',
|
||||
url='https://github.com/chardet/chardet',
|
||||
license="LGPL",
|
||||
keywords=['encoding', 'i18n', 'xml'],
|
||||
classifiers=["Development Status :: 5 - Production/Stable",
|
||||
"Intended Audience :: Developers",
|
||||
("License :: OSI Approved :: GNU Library or Lesser General"
|
||||
" Public License (LGPL)"),
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python",
|
||||
'Programming Language :: Python :: 2',
|
||||
'Programming Language :: Python :: 2.7',
|
||||
'Programming Language :: Python :: 3',
|
||||
'Programming Language :: Python :: 3.5',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Programming Language :: Python :: 3.9',
|
||||
('Programming Language :: Python :: Implementation :: '
|
||||
'CPython'),
|
||||
'Programming Language :: Python :: Implementation :: PyPy',
|
||||
("Topic :: Software Development :: Libraries :: Python "
|
||||
"Modules"),
|
||||
"Topic :: Text Processing :: Linguistic"],
|
||||
packages=find_packages(),
|
||||
python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*",
|
||||
entry_points={'console_scripts':
|
||||
['chardetect = chardet.cli.chardetect:main']})
|
|
@ -1,146 +0,0 @@
|
|||
"""
|
||||
Run chardet on a bunch of documents and see that we get the correct encodings.
|
||||
|
||||
:author: Dan Blanchard
|
||||
:author: Ian Cordasco
|
||||
"""
|
||||
|
||||
from __future__ import with_statement
|
||||
|
||||
import textwrap
|
||||
from difflib import ndiff
|
||||
from io import open
|
||||
from os import listdir
|
||||
from os.path import dirname, isdir, join, realpath, relpath, splitext
|
||||
|
||||
try:
|
||||
import hypothesis.strategies as st
|
||||
from hypothesis import given, assume, settings, Verbosity
|
||||
HAVE_HYPOTHESIS = True
|
||||
except ImportError:
|
||||
HAVE_HYPOTHESIS = False
|
||||
import pytest
|
||||
|
||||
import chardet
|
||||
|
||||
|
||||
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
|
||||
# retrain model.
|
||||
MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250',
|
||||
'windows-1254', 'windows-1256'}
|
||||
EXPECTED_FAILURES = {'tests/iso-8859-7-greek/disabled.gr.xml',
|
||||
'tests/iso-8859-9-turkish/divxplanet.com.xml',
|
||||
'tests/iso-8859-9-turkish/subtitle.srt',
|
||||
'tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'}
|
||||
|
||||
def gen_test_params():
|
||||
"""Yields tuples of paths and encodings to use for test_encoding_detection"""
|
||||
base_path = relpath(join(dirname(realpath(__file__)), 'tests'))
|
||||
for encoding in listdir(base_path):
|
||||
path = join(base_path, encoding)
|
||||
# Skip files in tests directory
|
||||
if not isdir(path):
|
||||
continue
|
||||
# Remove language suffixes from encoding if pressent
|
||||
encoding = encoding.lower()
|
||||
for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
|
||||
'-hebrew', '-hungarian', '-turkish']:
|
||||
if encoding.endswith(postfix):
|
||||
encoding = encoding.rpartition(postfix)[0]
|
||||
break
|
||||
# Skip directories for encodings we don't handle yet.
|
||||
if encoding in MISSING_ENCODINGS:
|
||||
continue
|
||||
# Test encoding detection for each file we have of encoding for
|
||||
for file_name in listdir(path):
|
||||
ext = splitext(file_name)[1].lower()
|
||||
if ext not in ['.html', '.txt', '.xml', '.srt']:
|
||||
continue
|
||||
full_path = join(path, file_name)
|
||||
test_case = full_path, encoding
|
||||
if full_path in EXPECTED_FAILURES:
|
||||
test_case = pytest.param(*test_case, marks=pytest.mark.xfail)
|
||||
yield test_case
|
||||
|
||||
|
||||
@pytest.mark.parametrize ('file_name, encoding', gen_test_params())
|
||||
def test_encoding_detection(file_name, encoding):
|
||||
with open(file_name, 'rb') as f:
|
||||
input_bytes = f.read()
|
||||
result = chardet.detect(input_bytes)
|
||||
try:
|
||||
expected_unicode = input_bytes.decode(encoding)
|
||||
except LookupError:
|
||||
expected_unicode = ''
|
||||
try:
|
||||
detected_unicode = input_bytes.decode(result['encoding'])
|
||||
except (LookupError, UnicodeDecodeError, TypeError):
|
||||
detected_unicode = ''
|
||||
if result:
|
||||
encoding_match = (result['encoding'] or '').lower() == encoding
|
||||
else:
|
||||
encoding_match = False
|
||||
# Only care about mismatches that would actually result in different
|
||||
# behavior when decoding
|
||||
if not encoding_match and expected_unicode != detected_unicode:
|
||||
wrapped_expected = '\n'.join(textwrap.wrap(expected_unicode, 100)) + '\n'
|
||||
wrapped_detected = '\n'.join(textwrap.wrap(detected_unicode, 100)) + '\n'
|
||||
diff = ''.join(ndiff(wrapped_expected.splitlines(True),
|
||||
wrapped_detected.splitlines(True)))
|
||||
else:
|
||||
diff = ''
|
||||
encoding_match = True
|
||||
assert encoding_match, ("Expected %s, but got %s for %s. Character "
|
||||
"differences: \n%s" % (encoding,
|
||||
result,
|
||||
file_name,
|
||||
diff))
|
||||
|
||||
|
||||
if HAVE_HYPOTHESIS:
|
||||
class JustALengthIssue(Exception):
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
|
||||
'utf-32', 'iso-8859-7',
|
||||
'iso-8859-8', 'windows-1255']),
|
||||
st.randoms())
|
||||
@settings(max_examples=200)
|
||||
def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
|
||||
try:
|
||||
data = txt.encode(enc)
|
||||
except UnicodeEncodeError:
|
||||
assume(False)
|
||||
detected = chardet.detect(data)['encoding']
|
||||
if detected is None:
|
||||
with pytest.raises(JustALengthIssue):
|
||||
@given(st.text(), random=rnd)
|
||||
@settings(verbosity=Verbosity.quiet, max_shrinks=0, max_examples=50)
|
||||
def string_poisons_following_text(suffix):
|
||||
try:
|
||||
extended = (txt + suffix).encode(enc)
|
||||
except UnicodeEncodeError:
|
||||
assume(False)
|
||||
result = chardet.detect(extended)
|
||||
if result and result['encoding'] is not None:
|
||||
raise JustALengthIssue()
|
||||
|
||||
|
||||
@given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
|
||||
'utf-32', 'iso-8859-7',
|
||||
'iso-8859-8', 'windows-1255']),
|
||||
st.randoms())
|
||||
@settings(max_examples=200)
|
||||
def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
|
||||
try:
|
||||
data = txt.encode(enc)
|
||||
except UnicodeEncodeError:
|
||||
assume(False)
|
||||
try:
|
||||
result = chardet.detect(data)
|
||||
results = chardet.detect_all(data)
|
||||
assert result['encoding'] == results[0]['encoding']
|
||||
except Exception:
|
||||
raise Exception('%s != %s' % (result, results))
|
|
@ -1,162 +0,0 @@
|
|||
.. :changelog:
|
||||
|
||||
History
|
||||
-------
|
||||
|
||||
2.10 (2020-06-27)
|
||||
+++++++++++++++++
|
||||
|
||||
- Update to Unicode 13.0.0.
|
||||
- Throws a more specific exception if "xn--" is provided as a label.
|
||||
- This is expected to be the last version that supports Python 2.
|
||||
|
||||
2.9 (2020-02-16)
|
||||
++++++++++++++++
|
||||
|
||||
- Update to Unicode 12.1.0.
|
||||
- Prohibit A-labels ending with a hyphen (Thanks, Julien Bernard!)
|
||||
- Future-proofing: Test on Python 3.7 and 3.8, don't immediately
|
||||
fail should Python 4 come along.
|
||||
- Made BSD 3-clause license clearer
|
||||
|
||||
2.8 (2018-12-04)
|
||||
++++++++++++++++
|
||||
|
||||
- Update to Unicode 11.0.0.
|
||||
- Provide more specific exceptions for some malformed labels.
|
||||
|
||||
2.7 (2018-06-10)
|
||||
++++++++++++++++
|
||||
|
||||
- Update to Unicode 10.0.0.
|
||||
- No longer accepts dot-prefixed domains (e.g. ".example") as valid.
|
||||
This is to be more conformant with the UTS 46 spec. Users should
|
||||
strip dot prefixes from domains before processing.
|
||||
|
||||
2.6 (2017-08-08)
|
||||
++++++++++++++++
|
||||
|
||||
- Allows generation of IDNA and UTS 46 table data for different
|
||||
versions of Unicode, by deriving properties directly from
|
||||
Unicode data.
|
||||
- Ability to generate RFC 5892/IANA-style table data
|
||||
- Diagnostic output of IDNA-related Unicode properties and
|
||||
derived calculations for a given codepoint
|
||||
- Support for idna.__version__ to report version
|
||||
- Support for idna.idnadata.__version__ and
|
||||
idna.uts46data.__version__ to report Unicode version of
|
||||
underlying IDNA and UTS 46 data respectively.
|
||||
|
||||
2.5 (2017-03-07)
|
||||
++++++++++++++++
|
||||
|
||||
- Fix bug with Katakana middle dot context-rule (Thanks, Greg
|
||||
Shikhman.)
|
||||
|
||||
2.4 (2017-03-01)
|
||||
++++++++++++++++
|
||||
|
||||
- Restore IDNAError to be a subclass of UnicodeError, as some users of
|
||||
this library are only looking for the latter to catch invalid strings.
|
||||
|
||||
2.3 (2017-02-28)
|
||||
++++++++++++++++
|
||||
|
||||
- Fix bugs relating to deriving IDNAError from UnicodeError.
|
||||
- More memory footprint improvements (Thanks, Alex Gaynor)
|
||||
|
||||
2.2 (2016-12-21)
|
||||
++++++++++++++++
|
||||
|
||||
- Made some changes to the UTS 46 data that should allow Jython to get around
|
||||
64kb Java class limits. (Thanks, John A. Booth and Marcin Płonka.)
|
||||
- In Python 2.6, skip two tests that rely on data not present in that
|
||||
Python version's unicodedata module.
|
||||
- Use relative imports to help downstream users.
|
||||
|
||||
2.1 (2016-03-20)
|
||||
++++++++++++++++
|
||||
|
||||
- Memory consumption optimizations. The library should consume significantly
|
||||
less memory through smarter data structures being used to represent
|
||||
relevant Unicode properties. Many thanks to Shivaram Lingamneni for this
|
||||
patch.
|
||||
- Patches to make library work better with Python 2.6. The core library
|
||||
currently works however the unit testing does not. (Thanks, Robert
|
||||
Buchholz)
|
||||
- Better affix all Unicode codepoint properties to a specific version.
|
||||
|
||||
2.0 (2015-05-18)
|
||||
++++++++++++++++
|
||||
|
||||
- Added support for Unicode IDNA Compatibility Processing (aka Unicode
|
||||
Technical Standard #46). Big thanks to Jon Ribbens who contributed this
|
||||
functionality.
|
||||
|
||||
1.1 (2015-01-27)
|
||||
++++++++++++++++
|
||||
|
||||
- Use IDNA properties from Unicode 6.3.0. Internet Architecture Board (IAB)
|
||||
issued statement recommending against the use of Unicode 7.0.0 until
|
||||
issues relating to U+08A1 codepoint are resolved. See http://goo.gl/Ed1n0K
|
||||
- Identify some cases when label would be too longer to be a legal DNS name
|
||||
and raise an exception. (Thanks, Ed Lewis)
|
||||
|
||||
1.0 (2014-10-12)
|
||||
++++++++++++++++
|
||||
|
||||
- Update IDNA properties for Unicode 7.0.0.
|
||||
|
||||
0.9 (2014-07-18)
|
||||
++++++++++++++++
|
||||
|
||||
- Fix issue with non-UTF-8 environments reading the README file
|
||||
now that it contains non-ASCII. (Thanks, Tom Prince)
|
||||
- Codec functions are useful, so they are separated into their own
|
||||
module, rather than just existing for compatibility reasons.
|
||||
- Add LICENSE file.
|
||||
|
||||
0.8 (2014-07-09)
|
||||
++++++++++++++++
|
||||
|
||||
- Added MANIFEST.in for correct source distribution compilation.
|
||||
|
||||
0.7 (2014-07-09)
|
||||
++++++++++++++++
|
||||
|
||||
- Filled out missing tests for various functions.
|
||||
- Fix bug in CONTEXTO validation for Greek lower numeral sign (U+0375)
|
||||
- Fix bug in CONTEXTO validation for Japanese middle dot (U+30FB)
|
||||
- Improved documentation
|
||||
- Move designation to Stable
|
||||
|
||||
0.6 (2014-04-29)
|
||||
++++++++++++++++
|
||||
|
||||
- Minor improvements to Python 3 support, tests (Thanks, Derek Wilson)
|
||||
|
||||
0.5 (2014-02-05)
|
||||
++++++++++++++++
|
||||
|
||||
- Update IDNA properties for Unicode 6.3.0.
|
||||
|
||||
0.4 (2014-01-07)
|
||||
++++++++++++++++
|
||||
|
||||
- Fix trove classifier for Python 3. (Thanks, Hynek Schlawack)
|
||||
|
||||
0.3 (2013-07-18)
|
||||
++++++++++++++++
|
||||
|
||||
- Ported to Python 3.
|
||||
|
||||
0.2 (2013-07-16)
|
||||
++++++++++++++++
|
||||
|
||||
- Improve packaging.
|
||||
- More conformant, passes all relevant tests in the Unicode TR46 test suite.
|
||||
|
||||
0.1 (2013-05-27)
|
||||
++++++++++++++++
|
||||
|
||||
- First proof-of-concept version.
|
|
@ -1,34 +0,0 @@
|
|||
License
|
||||
-------
|
||||
|
||||
License: bsd-3-clause
|
||||
|
||||
Copyright (c) 2013-2020, Kim Davies. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
#. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
#. Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following
|
||||
disclaimer in the documentation and/or other materials provided with
|
||||
the distribution.
|
||||
|
||||
#. Neither the name of the copyright holder nor the names of the
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
#. THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS "AS IS" AND ANY
|
||||
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
|
||||
DAMAGE.
|
|
@ -1,6 +0,0 @@
|
|||
include *.rst
|
||||
recursive-include tools *
|
||||
recursive-exclude tools *.pyc
|
||||
recursive-include tests *
|
||||
recursive-exclude tests *.pyc
|
||||
|
|
@ -1,241 +0,0 @@
|
|||
Metadata-Version: 1.2
|
||||
Name: idna
|
||||
Version: 2.10
|
||||
Summary: Internationalized Domain Names in Applications (IDNA)
|
||||
Home-page: https://github.com/kjd/idna
|
||||
Author: Kim Davies
|
||||
Author-email: kim@cynosure.com.au
|
||||
License: BSD-like
|
||||
Description: Internationalized Domain Names in Applications (IDNA)
|
||||
=====================================================
|
||||
|
||||
Support for the Internationalised Domain Names in Applications
|
||||
(IDNA) protocol as specified in `RFC 5891 <http://tools.ietf.org/html/rfc5891>`_.
|
||||
This is the latest version of the protocol and is sometimes referred to as
|
||||
“IDNA 2008”.
|
||||
|
||||
This library also provides support for Unicode Technical Standard 46,
|
||||
`Unicode IDNA Compatibility Processing <http://unicode.org/reports/tr46/>`_.
|
||||
|
||||
This acts as a suitable replacement for the “encodings.idna” module that
|
||||
comes with the Python standard library, but only supports the
|
||||
old, deprecated IDNA specification (`RFC 3490 <http://tools.ietf.org/html/rfc3490>`_).
|
||||
|
||||
Basic functions are simply executed:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# Python 3
|
||||
>>> import idna
|
||||
>>> idna.encode('ドメイン.テスト')
|
||||
b'xn--eckwd4c7c.xn--zckzah'
|
||||
>>> print(idna.decode('xn--eckwd4c7c.xn--zckzah'))
|
||||
ドメイン.テスト
|
||||
|
||||
# Python 2
|
||||
>>> import idna
|
||||
>>> idna.encode(u'ドメイン.テスト')
|
||||
'xn--eckwd4c7c.xn--zckzah'
|
||||
>>> print idna.decode('xn--eckwd4c7c.xn--zckzah')
|
||||
ドメイン.テスト
|
||||
|
||||
Packages
|
||||
--------
|
||||
|
||||
The latest tagged release version is published in the PyPI repository:
|
||||
|
||||
.. image:: https://badge.fury.io/py/idna.svg
|
||||
:target: http://badge.fury.io/py/idna
|
||||
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
To install this library, you can use pip:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ pip install idna
|
||||
|
||||
Alternatively, you can install the package using the bundled setup script:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ python setup.py install
|
||||
|
||||
This library works with Python 2.7 and Python 3.4 or later.
|
||||
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
For typical usage, the ``encode`` and ``decode`` functions will take a domain
|
||||
name argument and perform a conversion to A-labels or U-labels respectively.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# Python 3
|
||||
>>> import idna
|
||||
>>> idna.encode('ドメイン.テスト')
|
||||
b'xn--eckwd4c7c.xn--zckzah'
|
||||
>>> print(idna.decode('xn--eckwd4c7c.xn--zckzah'))
|
||||
ドメイン.テスト
|
||||
|
||||
You may use the codec encoding and decoding methods using the
|
||||
``idna.codec`` module:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# Python 2
|
||||
>>> import idna.codec
|
||||
>>> print u'домена.испытание'.encode('idna')
|
||||
xn--80ahd1agd.xn--80akhbyknj4f
|
||||
>>> print 'xn--80ahd1agd.xn--80akhbyknj4f'.decode('idna')
|
||||
домена.испытание
|
||||
|
||||
Conversions can be applied at a per-label basis using the ``ulabel`` or ``alabel``
|
||||
functions if necessary:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# Python 2
|
||||
>>> idna.alabel(u'测试')
|
||||
'xn--0zwm56d'
|
||||
|
||||
Compatibility Mapping (UTS #46)
|
||||
+++++++++++++++++++++++++++++++
|
||||
|
||||
As described in `RFC 5895 <http://tools.ietf.org/html/rfc5895>`_, the IDNA
|
||||
specification no longer normalizes input from different potential ways a user
|
||||
may input a domain name. This functionality, known as a “mapping”, is now
|
||||
considered by the specification to be a local user-interface issue distinct
|
||||
from IDNA conversion functionality.
|
||||
|
||||
This library provides one such mapping, that was developed by the Unicode
|
||||
Consortium. Known as `Unicode IDNA Compatibility Processing <http://unicode.org/reports/tr46/>`_,
|
||||
it provides for both a regular mapping for typical applications, as well as
|
||||
a transitional mapping to help migrate from older IDNA 2003 applications.
|
||||
|
||||
For example, “Königsgäßchen” is not a permissible label as *LATIN CAPITAL
|
||||
LETTER K* is not allowed (nor are capital letters in general). UTS 46 will
|
||||
convert this into lower case prior to applying the IDNA conversion.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# Python 3
|
||||
>>> import idna
|
||||
>>> idna.encode(u'Königsgäßchen')
|
||||
...
|
||||
idna.core.InvalidCodepoint: Codepoint U+004B at position 1 of 'Königsgäßchen' not allowed
|
||||
>>> idna.encode('Königsgäßchen', uts46=True)
|
||||
b'xn--knigsgchen-b4a3dun'
|
||||
>>> print(idna.decode('xn--knigsgchen-b4a3dun'))
|
||||
königsgäßchen
|
||||
|
||||
Transitional processing provides conversions to help transition from the older
|
||||
2003 standard to the current standard. For example, in the original IDNA
|
||||
specification, the *LATIN SMALL LETTER SHARP S* (ß) was converted into two
|
||||
*LATIN SMALL LETTER S* (ss), whereas in the current IDNA specification this
|
||||
conversion is not performed.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# Python 2
|
||||
>>> idna.encode(u'Königsgäßchen', uts46=True, transitional=True)
|
||||
'xn--knigsgsschen-lcb0w'
|
||||
|
||||
Implementors should use transitional processing with caution, only in rare
|
||||
cases where conversion from legacy labels to current labels must be performed
|
||||
(i.e. IDNA implementations that pre-date 2008). For typical applications
|
||||
that just need to convert labels, transitional processing is unlikely to be
|
||||
beneficial and could produce unexpected incompatible results.
|
||||
|
||||
``encodings.idna`` Compatibility
|
||||
++++++++++++++++++++++++++++++++
|
||||
|
||||
Function calls from the Python built-in ``encodings.idna`` module are
|
||||
mapped to their IDNA 2008 equivalents using the ``idna.compat`` module.
|
||||
Simply substitute the ``import`` clause in your code to refer to the
|
||||
new module name.
|
||||
|
||||
Exceptions
|
||||
----------
|
||||
|
||||
All errors raised during the conversion following the specification should
|
||||
raise an exception derived from the ``idna.IDNAError`` base class.
|
||||
|
||||
More specific exceptions that may be generated as ``idna.IDNABidiError``
|
||||
when the error reflects an illegal combination of left-to-right and right-to-left
|
||||
characters in a label; ``idna.InvalidCodepoint`` when a specific codepoint is
|
||||
an illegal character in an IDN label (i.e. INVALID); and ``idna.InvalidCodepointContext``
|
||||
when the codepoint is illegal based on its positional context (i.e. it is CONTEXTO
|
||||
or CONTEXTJ but the contextual requirements are not satisfied.)
|
||||
|
||||
Building and Diagnostics
|
||||
------------------------
|
||||
|
||||
The IDNA and UTS 46 functionality relies upon pre-calculated lookup tables for
|
||||
performance. These tables are derived from computing against eligibility criteria
|
||||
in the respective standards. These tables are computed using the command-line
|
||||
script ``tools/idna-data``.
|
||||
|
||||
This tool will fetch relevant tables from the Unicode Consortium and perform the
|
||||
required calculations to identify eligibility. It has three main modes:
|
||||
|
||||
* ``idna-data make-libdata``. Generates ``idnadata.py`` and ``uts46data.py``,
|
||||
the pre-calculated lookup tables using for IDNA and UTS 46 conversions. Implementors
|
||||
who wish to track this library against a different Unicode version may use this tool
|
||||
to manually generate a different version of the ``idnadata.py`` and ``uts46data.py``
|
||||
files.
|
||||
|
||||
* ``idna-data make-table``. Generate a table of the IDNA disposition
|
||||
(e.g. PVALID, CONTEXTJ, CONTEXTO) in the format found in Appendix B.1 of RFC
|
||||
5892 and the pre-computed tables published by `IANA <http://iana.org/>`_.
|
||||
|
||||
* ``idna-data U+0061``. Prints debugging output on the various properties
|
||||
associated with an individual Unicode codepoint (in this case, U+0061), that are
|
||||
used to assess the IDNA and UTS 46 status of a codepoint. This is helpful in debugging
|
||||
or analysis.
|
||||
|
||||
The tool accepts a number of arguments, described using ``idna-data -h``. Most notably,
|
||||
the ``--version`` argument allows the specification of the version of Unicode to use
|
||||
in computing the table data. For example, ``idna-data --version 9.0.0 make-libdata``
|
||||
will generate library data against Unicode 9.0.0.
|
||||
|
||||
Note that this script requires Python 3, but all generated library data will work
|
||||
in Python 2.7.
|
||||
|
||||
|
||||
Testing
|
||||
-------
|
||||
|
||||
The library has a test suite based on each rule of the IDNA specification, as
|
||||
well as tests that are provided as part of the Unicode Technical Standard 46,
|
||||
`Unicode IDNA Compatibility Processing <http://unicode.org/reports/tr46/>`_.
|
||||
|
||||
The tests are run automatically on each commit at Travis CI:
|
||||
|
||||
.. image:: https://travis-ci.org/kjd/idna.svg?branch=master
|
||||
:target: https://travis-ci.org/kjd/idna
|
||||
|
||||
Platform: UNKNOWN
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Intended Audience :: System Administrators
|
||||
Classifier: License :: OSI Approved :: BSD License
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 2
|
||||
Classifier: Programming Language :: Python :: 2.7
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.4
|
||||
Classifier: Programming Language :: Python :: 3.5
|
||||
Classifier: Programming Language :: Python :: 3.6
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Internet :: Name Service (DNS)
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Topic :: Utilities
|
||||
Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*
|
|
@ -1,211 +0,0 @@
|
|||
Internationalized Domain Names in Applications (IDNA)
|
||||
=====================================================
|
||||
|
||||
Support for the Internationalised Domain Names in Applications
|
||||
(IDNA) protocol as specified in `RFC 5891 <http://tools.ietf.org/html/rfc5891>`_.
|
||||
This is the latest version of the protocol and is sometimes referred to as
|
||||
“IDNA 2008”.
|
||||
|
||||
This library also provides support for Unicode Technical Standard 46,
|
||||
`Unicode IDNA Compatibility Processing <http://unicode.org/reports/tr46/>`_.
|
||||
|
||||
This acts as a suitable replacement for the “encodings.idna” module that
|
||||
comes with the Python standard library, but only supports the
|
||||
old, deprecated IDNA specification (`RFC 3490 <http://tools.ietf.org/html/rfc3490>`_).
|
||||
|
||||
Basic functions are simply executed:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# Python 3
|
||||
>>> import idna
|
||||
>>> idna.encode('ドメイン.テスト')
|
||||
b'xn--eckwd4c7c.xn--zckzah'
|
||||
>>> print(idna.decode('xn--eckwd4c7c.xn--zckzah'))
|
||||
ドメイン.テスト
|
||||
|
||||
# Python 2
|
||||
>>> import idna
|
||||
>>> idna.encode(u'ドメイン.テスト')
|
||||
'xn--eckwd4c7c.xn--zckzah'
|
||||
>>> print idna.decode('xn--eckwd4c7c.xn--zckzah')
|
||||
ドメイン.テスト
|
||||
|
||||
Packages
|
||||
--------
|
||||
|
||||
The latest tagged release version is published in the PyPI repository:
|
||||
|
||||
.. image:: https://badge.fury.io/py/idna.svg
|
||||
:target: http://badge.fury.io/py/idna
|
||||
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
To install this library, you can use pip:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ pip install idna
|
||||
|
||||
Alternatively, you can install the package using the bundled setup script:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ python setup.py install
|
||||
|
||||
This library works with Python 2.7 and Python 3.4 or later.
|
||||
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
For typical usage, the ``encode`` and ``decode`` functions will take a domain
|
||||
name argument and perform a conversion to A-labels or U-labels respectively.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# Python 3
|
||||
>>> import idna
|
||||
>>> idna.encode('ドメイン.テスト')
|
||||
b'xn--eckwd4c7c.xn--zckzah'
|
||||
>>> print(idna.decode('xn--eckwd4c7c.xn--zckzah'))
|
||||
ドメイン.テスト
|
||||
|
||||
You may use the codec encoding and decoding methods using the
|
||||
``idna.codec`` module:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# Python 2
|
||||
>>> import idna.codec
|
||||
>>> print u'домена.испытание'.encode('idna')
|
||||
xn--80ahd1agd.xn--80akhbyknj4f
|
||||
>>> print 'xn--80ahd1agd.xn--80akhbyknj4f'.decode('idna')
|
||||
домена.испытание
|
||||
|
||||
Conversions can be applied at a per-label basis using the ``ulabel`` or ``alabel``
|
||||
functions if necessary:
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# Python 2
|
||||
>>> idna.alabel(u'测试')
|
||||
'xn--0zwm56d'
|
||||
|
||||
Compatibility Mapping (UTS #46)
|
||||
+++++++++++++++++++++++++++++++
|
||||
|
||||
As described in `RFC 5895 <http://tools.ietf.org/html/rfc5895>`_, the IDNA
|
||||
specification no longer normalizes input from different potential ways a user
|
||||
may input a domain name. This functionality, known as a “mapping”, is now
|
||||
considered by the specification to be a local user-interface issue distinct
|
||||
from IDNA conversion functionality.
|
||||
|
||||
This library provides one such mapping, that was developed by the Unicode
|
||||
Consortium. Known as `Unicode IDNA Compatibility Processing <http://unicode.org/reports/tr46/>`_,
|
||||
it provides for both a regular mapping for typical applications, as well as
|
||||
a transitional mapping to help migrate from older IDNA 2003 applications.
|
||||
|
||||
For example, “Königsgäßchen” is not a permissible label as *LATIN CAPITAL
|
||||
LETTER K* is not allowed (nor are capital letters in general). UTS 46 will
|
||||
convert this into lower case prior to applying the IDNA conversion.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# Python 3
|
||||
>>> import idna
|
||||
>>> idna.encode(u'Königsgäßchen')
|
||||
...
|
||||
idna.core.InvalidCodepoint: Codepoint U+004B at position 1 of 'Königsgäßchen' not allowed
|
||||
>>> idna.encode('Königsgäßchen', uts46=True)
|
||||
b'xn--knigsgchen-b4a3dun'
|
||||
>>> print(idna.decode('xn--knigsgchen-b4a3dun'))
|
||||
königsgäßchen
|
||||
|
||||
Transitional processing provides conversions to help transition from the older
|
||||
2003 standard to the current standard. For example, in the original IDNA
|
||||
specification, the *LATIN SMALL LETTER SHARP S* (ß) was converted into two
|
||||
*LATIN SMALL LETTER S* (ss), whereas in the current IDNA specification this
|
||||
conversion is not performed.
|
||||
|
||||
.. code-block:: pycon
|
||||
|
||||
# Python 2
|
||||
>>> idna.encode(u'Königsgäßchen', uts46=True, transitional=True)
|
||||
'xn--knigsgsschen-lcb0w'
|
||||
|
||||
Implementors should use transitional processing with caution, only in rare
|
||||
cases where conversion from legacy labels to current labels must be performed
|
||||
(i.e. IDNA implementations that pre-date 2008). For typical applications
|
||||
that just need to convert labels, transitional processing is unlikely to be
|
||||
beneficial and could produce unexpected incompatible results.
|
||||
|
||||
``encodings.idna`` Compatibility
|
||||
++++++++++++++++++++++++++++++++
|
||||
|
||||
Function calls from the Python built-in ``encodings.idna`` module are
|
||||
mapped to their IDNA 2008 equivalents using the ``idna.compat`` module.
|
||||
Simply substitute the ``import`` clause in your code to refer to the
|
||||
new module name.
|
||||
|
||||
Exceptions
|
||||
----------
|
||||
|
||||
All errors raised during the conversion following the specification should
|
||||
raise an exception derived from the ``idna.IDNAError`` base class.
|
||||
|
||||
More specific exceptions that may be generated as ``idna.IDNABidiError``
|
||||
when the error reflects an illegal combination of left-to-right and right-to-left
|
||||
characters in a label; ``idna.InvalidCodepoint`` when a specific codepoint is
|
||||
an illegal character in an IDN label (i.e. INVALID); and ``idna.InvalidCodepointContext``
|
||||
when the codepoint is illegal based on its positional context (i.e. it is CONTEXTO
|
||||
or CONTEXTJ but the contextual requirements are not satisfied.)
|
||||
|
||||
Building and Diagnostics
|
||||
------------------------
|
||||
|
||||
The IDNA and UTS 46 functionality relies upon pre-calculated lookup tables for
|
||||
performance. These tables are derived from computing against eligibility criteria
|
||||
in the respective standards. These tables are computed using the command-line
|
||||
script ``tools/idna-data``.
|
||||
|
||||
This tool will fetch relevant tables from the Unicode Consortium and perform the
|
||||
required calculations to identify eligibility. It has three main modes:
|
||||
|
||||
* ``idna-data make-libdata``. Generates ``idnadata.py`` and ``uts46data.py``,
|
||||
the pre-calculated lookup tables using for IDNA and UTS 46 conversions. Implementors
|
||||
who wish to track this library against a different Unicode version may use this tool
|
||||
to manually generate a different version of the ``idnadata.py`` and ``uts46data.py``
|
||||
files.
|
||||
|
||||
* ``idna-data make-table``. Generate a table of the IDNA disposition
|
||||
(e.g. PVALID, CONTEXTJ, CONTEXTO) in the format found in Appendix B.1 of RFC
|
||||
5892 and the pre-computed tables published by `IANA <http://iana.org/>`_.
|
||||
|
||||
* ``idna-data U+0061``. Prints debugging output on the various properties
|
||||
associated with an individual Unicode codepoint (in this case, U+0061), that are
|
||||
used to assess the IDNA and UTS 46 status of a codepoint. This is helpful in debugging
|
||||
or analysis.
|
||||
|
||||
The tool accepts a number of arguments, described using ``idna-data -h``. Most notably,
|
||||
the ``--version`` argument allows the specification of the version of Unicode to use
|
||||
in computing the table data. For example, ``idna-data --version 9.0.0 make-libdata``
|
||||
will generate library data against Unicode 9.0.0.
|
||||
|
||||
Note that this script requires Python 3, but all generated library data will work
|
||||
in Python 2.7.
|
||||
|
||||
|
||||
Testing
|
||||
-------
|
||||
|
||||
The library has a test suite based on each rule of the IDNA specification, as
|
||||
well as tests that are provided as part of the Unicode Technical Standard 46,
|
||||
`Unicode IDNA Compatibility Processing <http://unicode.org/reports/tr46/>`_.
|
||||
|
||||
The tests are run automatically on each commit at Travis CI:
|
||||
|
||||
.. image:: https://travis-ci.org/kjd/idna.svg?branch=master
|
||||
:target: https://travis-ci.org/kjd/idna
|
|
@ -1,2 +0,0 @@
|
|||
from .package_data import __version__
|
||||
from .core import *
|
|
@ -1,118 +0,0 @@
|
|||
from .core import encode, decode, alabel, ulabel, IDNAError
|
||||
import codecs
|
||||
import re
|
||||
|
||||
_unicode_dots_re = re.compile(u'[\u002e\u3002\uff0e\uff61]')
|
||||
|
||||
class Codec(codecs.Codec):
|
||||
|
||||
def encode(self, data, errors='strict'):
|
||||
|
||||
if errors != 'strict':
|
||||
raise IDNAError("Unsupported error handling \"{0}\"".format(errors))
|
||||
|
||||
if not data:
|
||||
return "", 0
|
||||
|
||||
return encode(data), len(data)
|
||||
|
||||
def decode(self, data, errors='strict'):
|
||||
|
||||
if errors != 'strict':
|
||||
raise IDNAError("Unsupported error handling \"{0}\"".format(errors))
|
||||
|
||||
if not data:
|
||||
return u"", 0
|
||||
|
||||
return decode(data), len(data)
|
||||
|
||||
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
|
||||
def _buffer_encode(self, data, errors, final):
|
||||
if errors != 'strict':
|
||||
raise IDNAError("Unsupported error handling \"{0}\"".format(errors))
|
||||
|
||||
if not data:
|
||||
return ("", 0)
|
||||
|
||||
labels = _unicode_dots_re.split(data)
|
||||
trailing_dot = u''
|
||||
if labels:
|
||||
if not labels[-1]:
|
||||
trailing_dot = '.'
|
||||
del labels[-1]
|
||||
elif not final:
|
||||
# Keep potentially unfinished label until the next call
|
||||
del labels[-1]
|
||||
if labels:
|
||||
trailing_dot = '.'
|
||||
|
||||
result = []
|
||||
size = 0
|
||||
for label in labels:
|
||||
result.append(alabel(label))
|
||||
if size:
|
||||
size += 1
|
||||
size += len(label)
|
||||
|
||||
# Join with U+002E
|
||||
result = ".".join(result) + trailing_dot
|
||||
size += len(trailing_dot)
|
||||
return (result, size)
|
||||
|
||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||
def _buffer_decode(self, data, errors, final):
|
||||
if errors != 'strict':
|
||||
raise IDNAError("Unsupported error handling \"{0}\"".format(errors))
|
||||
|
||||
if not data:
|
||||
return (u"", 0)
|
||||
|
||||
# IDNA allows decoding to operate on Unicode strings, too.
|
||||
if isinstance(data, unicode):
|
||||
labels = _unicode_dots_re.split(data)
|
||||
else:
|
||||
# Must be ASCII string
|
||||
data = str(data)
|
||||
unicode(data, "ascii")
|
||||
labels = data.split(".")
|
||||
|
||||
trailing_dot = u''
|
||||
if labels:
|
||||
if not labels[-1]:
|
||||
trailing_dot = u'.'
|
||||
del labels[-1]
|
||||
elif not final:
|
||||
# Keep potentially unfinished label until the next call
|
||||
del labels[-1]
|
||||
if labels:
|
||||
trailing_dot = u'.'
|
||||
|
||||
result = []
|
||||
size = 0
|
||||
for label in labels:
|
||||
result.append(ulabel(label))
|
||||
if size:
|
||||
size += 1
|
||||
size += len(label)
|
||||
|
||||
result = u".".join(result) + trailing_dot
|
||||
size += len(trailing_dot)
|
||||
return (result, size)
|
||||
|
||||
|
||||
class StreamWriter(Codec, codecs.StreamWriter):
|
||||
pass
|
||||
|
||||
class StreamReader(Codec, codecs.StreamReader):
|
||||
pass
|
||||
|
||||
def getregentry():
|
||||
return codecs.CodecInfo(
|
||||
name='idna',
|
||||
encode=Codec().encode,
|
||||
decode=Codec().decode,
|
||||
incrementalencoder=IncrementalEncoder,
|
||||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
)
|
|
@ -1,12 +0,0 @@
|
|||
from .core import *
|
||||
from .codec import *
|
||||
|
||||
def ToASCII(label):
|
||||
return encode(label)
|
||||
|
||||
def ToUnicode(label):
|
||||
return decode(label)
|
||||
|
||||
def nameprep(s):
|
||||
raise NotImplementedError("IDNA 2008 does not utilise nameprep protocol")
|
||||
|
|
@ -1,400 +0,0 @@
|
|||
from . import idnadata
|
||||
import bisect
|
||||
import unicodedata
|
||||
import re
|
||||
import sys
|
||||
from .intranges import intranges_contain
|
||||
|
||||
_virama_combining_class = 9
|
||||
_alabel_prefix = b'xn--'
|
||||
_unicode_dots_re = re.compile(u'[\u002e\u3002\uff0e\uff61]')
|
||||
|
||||
if sys.version_info[0] >= 3:
|
||||
unicode = str
|
||||
unichr = chr
|
||||
|
||||
class IDNAError(UnicodeError):
|
||||
""" Base exception for all IDNA-encoding related problems """
|
||||
pass
|
||||
|
||||
|
||||
class IDNABidiError(IDNAError):
|
||||
""" Exception when bidirectional requirements are not satisfied """
|
||||
pass
|
||||
|
||||
|
||||
class InvalidCodepoint(IDNAError):
|
||||
""" Exception when a disallowed or unallocated codepoint is used """
|
||||
pass
|
||||
|
||||
|
||||
class InvalidCodepointContext(IDNAError):
|
||||
""" Exception when the codepoint is not valid in the context it is used """
|
||||
pass
|
||||
|
||||
|
||||
def _combining_class(cp):
|
||||
v = unicodedata.combining(unichr(cp))
|
||||
if v == 0:
|
||||
if not unicodedata.name(unichr(cp)):
|
||||
raise ValueError("Unknown character in unicodedata")
|
||||
return v
|
||||
|
||||
def _is_script(cp, script):
|
||||
return intranges_contain(ord(cp), idnadata.scripts[script])
|
||||
|
||||
def _punycode(s):
|
||||
return s.encode('punycode')
|
||||
|
||||
def _unot(s):
|
||||
return 'U+{0:04X}'.format(s)
|
||||
|
||||
|
||||
def valid_label_length(label):
|
||||
|
||||
if len(label) > 63:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def valid_string_length(label, trailing_dot):
|
||||
|
||||
if len(label) > (254 if trailing_dot else 253):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def check_bidi(label, check_ltr=False):
|
||||
|
||||
# Bidi rules should only be applied if string contains RTL characters
|
||||
bidi_label = False
|
||||
for (idx, cp) in enumerate(label, 1):
|
||||
direction = unicodedata.bidirectional(cp)
|
||||
if direction == '':
|
||||
# String likely comes from a newer version of Unicode
|
||||
raise IDNABidiError('Unknown directionality in label {0} at position {1}'.format(repr(label), idx))
|
||||
if direction in ['R', 'AL', 'AN']:
|
||||
bidi_label = True
|
||||
if not bidi_label and not check_ltr:
|
||||
return True
|
||||
|
||||
# Bidi rule 1
|
||||
direction = unicodedata.bidirectional(label[0])
|
||||
if direction in ['R', 'AL']:
|
||||
rtl = True
|
||||
elif direction == 'L':
|
||||
rtl = False
|
||||
else:
|
||||
raise IDNABidiError('First codepoint in label {0} must be directionality L, R or AL'.format(repr(label)))
|
||||
|
||||
valid_ending = False
|
||||
number_type = False
|
||||
for (idx, cp) in enumerate(label, 1):
|
||||
direction = unicodedata.bidirectional(cp)
|
||||
|
||||
if rtl:
|
||||
# Bidi rule 2
|
||||
if not direction in ['R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM']:
|
||||
raise IDNABidiError('Invalid direction for codepoint at position {0} in a right-to-left label'.format(idx))
|
||||
# Bidi rule 3
|
||||
if direction in ['R', 'AL', 'EN', 'AN']:
|
||||
valid_ending = True
|
||||
elif direction != 'NSM':
|
||||
valid_ending = False
|
||||
# Bidi rule 4
|
||||
if direction in ['AN', 'EN']:
|
||||
if not number_type:
|
||||
number_type = direction
|
||||
else:
|
||||
if number_type != direction:
|
||||
raise IDNABidiError('Can not mix numeral types in a right-to-left label')
|
||||
else:
|
||||
# Bidi rule 5
|
||||
if not direction in ['L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM']:
|
||||
raise IDNABidiError('Invalid direction for codepoint at position {0} in a left-to-right label'.format(idx))
|
||||
# Bidi rule 6
|
||||
if direction in ['L', 'EN']:
|
||||
valid_ending = True
|
||||
elif direction != 'NSM':
|
||||
valid_ending = False
|
||||
|
||||
if not valid_ending:
|
||||
raise IDNABidiError('Label ends with illegal codepoint directionality')
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def check_initial_combiner(label):
|
||||
|
||||
if unicodedata.category(label[0])[0] == 'M':
|
||||
raise IDNAError('Label begins with an illegal combining character')
|
||||
return True
|
||||
|
||||
|
||||
def check_hyphen_ok(label):
|
||||
|
||||
if label[2:4] == '--':
|
||||
raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
|
||||
if label[0] == '-' or label[-1] == '-':
|
||||
raise IDNAError('Label must not start or end with a hyphen')
|
||||
return True
|
||||
|
||||
|
||||
def check_nfc(label):
|
||||
|
||||
if unicodedata.normalize('NFC', label) != label:
|
||||
raise IDNAError('Label must be in Normalization Form C')
|
||||
|
||||
|
||||
def valid_contextj(label, pos):
|
||||
|
||||
cp_value = ord(label[pos])
|
||||
|
||||
if cp_value == 0x200c:
|
||||
|
||||
if pos > 0:
|
||||
if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
|
||||
return True
|
||||
|
||||
ok = False
|
||||
for i in range(pos-1, -1, -1):
|
||||
joining_type = idnadata.joining_types.get(ord(label[i]))
|
||||
if joining_type == ord('T'):
|
||||
continue
|
||||
if joining_type in [ord('L'), ord('D')]:
|
||||
ok = True
|
||||
break
|
||||
|
||||
if not ok:
|
||||
return False
|
||||
|
||||
ok = False
|
||||
for i in range(pos+1, len(label)):
|
||||
joining_type = idnadata.joining_types.get(ord(label[i]))
|
||||
if joining_type == ord('T'):
|
||||
continue
|
||||
if joining_type in [ord('R'), ord('D')]:
|
||||
ok = True
|
||||
break
|
||||
return ok
|
||||
|
||||
if cp_value == 0x200d:
|
||||
|
||||
if pos > 0:
|
||||
if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
|
||||
return True
|
||||
return False
|
||||
|
||||
else:
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def valid_contexto(label, pos, exception=False):
|
||||
|
||||
cp_value = ord(label[pos])
|
||||
|
||||
if cp_value == 0x00b7:
|
||||
if 0 < pos < len(label)-1:
|
||||
if ord(label[pos - 1]) == 0x006c and ord(label[pos + 1]) == 0x006c:
|
||||
return True
|
||||
return False
|
||||
|
||||
elif cp_value == 0x0375:
|
||||
if pos < len(label)-1 and len(label) > 1:
|
||||
return _is_script(label[pos + 1], 'Greek')
|
||||
return False
|
||||
|
||||
elif cp_value == 0x05f3 or cp_value == 0x05f4:
|
||||
if pos > 0:
|
||||
return _is_script(label[pos - 1], 'Hebrew')
|
||||
return False
|
||||
|
||||
elif cp_value == 0x30fb:
|
||||
for cp in label:
|
||||
if cp == u'\u30fb':
|
||||
continue
|
||||
if _is_script(cp, 'Hiragana') or _is_script(cp, 'Katakana') or _is_script(cp, 'Han'):
|
||||
return True
|
||||
return False
|
||||
|
||||
elif 0x660 <= cp_value <= 0x669:
|
||||
for cp in label:
|
||||
if 0x6f0 <= ord(cp) <= 0x06f9:
|
||||
return False
|
||||
return True
|
||||
|
||||
elif 0x6f0 <= cp_value <= 0x6f9:
|
||||
for cp in label:
|
||||
if 0x660 <= ord(cp) <= 0x0669:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def check_label(label):
|
||||
|
||||
if isinstance(label, (bytes, bytearray)):
|
||||
label = label.decode('utf-8')
|
||||
if len(label) == 0:
|
||||
raise IDNAError('Empty Label')
|
||||
|
||||
check_nfc(label)
|
||||
check_hyphen_ok(label)
|
||||
check_initial_combiner(label)
|
||||
|
||||
for (pos, cp) in enumerate(label):
|
||||
cp_value = ord(cp)
|
||||
if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
|
||||
continue
|
||||
elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
|
||||
try:
|
||||
if not valid_contextj(label, pos):
|
||||
raise InvalidCodepointContext('Joiner {0} not allowed at position {1} in {2}'.format(
|
||||
_unot(cp_value), pos+1, repr(label)))
|
||||
except ValueError:
|
||||
raise IDNAError('Unknown codepoint adjacent to joiner {0} at position {1} in {2}'.format(
|
||||
_unot(cp_value), pos+1, repr(label)))
|
||||
elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
|
||||
if not valid_contexto(label, pos):
|
||||
raise InvalidCodepointContext('Codepoint {0} not allowed at position {1} in {2}'.format(_unot(cp_value), pos+1, repr(label)))
|
||||
else:
|
||||
raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
|
||||
|
||||
check_bidi(label)
|
||||
|
||||
|
||||
def alabel(label):
|
||||
|
||||
try:
|
||||
label = label.encode('ascii')
|
||||
ulabel(label)
|
||||
if not valid_label_length(label):
|
||||
raise IDNAError('Label too long')
|
||||
return label
|
||||
except UnicodeEncodeError:
|
||||
pass
|
||||
|
||||
if not label:
|
||||
raise IDNAError('No Input')
|
||||
|
||||
label = unicode(label)
|
||||
check_label(label)
|
||||
label = _punycode(label)
|
||||
label = _alabel_prefix + label
|
||||
|
||||
if not valid_label_length(label):
|
||||
raise IDNAError('Label too long')
|
||||
|
||||
return label
|
||||
|
||||
|
||||
def ulabel(label):
|
||||
|
||||
if not isinstance(label, (bytes, bytearray)):
|
||||
try:
|
||||
label = label.encode('ascii')
|
||||
except UnicodeEncodeError:
|
||||
check_label(label)
|
||||
return label
|
||||
|
||||
label = label.lower()
|
||||
if label.startswith(_alabel_prefix):
|
||||
label = label[len(_alabel_prefix):]
|
||||
if not label:
|
||||
raise IDNAError('Malformed A-label, no Punycode eligible content found')
|
||||
if label.decode('ascii')[-1] == '-':
|
||||
raise IDNAError('A-label must not end with a hyphen')
|
||||
else:
|
||||
check_label(label)
|
||||
return label.decode('ascii')
|
||||
|
||||
label = label.decode('punycode')
|
||||
check_label(label)
|
||||
return label
|
||||
|
||||
|
||||
def uts46_remap(domain, std3_rules=True, transitional=False):
|
||||
"""Re-map the characters in the string according to UTS46 processing."""
|
||||
from .uts46data import uts46data
|
||||
output = u""
|
||||
try:
|
||||
for pos, char in enumerate(domain):
|
||||
code_point = ord(char)
|
||||
uts46row = uts46data[code_point if code_point < 256 else
|
||||
bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
|
||||
status = uts46row[1]
|
||||
replacement = uts46row[2] if len(uts46row) == 3 else None
|
||||
if (status == "V" or
|
||||
(status == "D" and not transitional) or
|
||||
(status == "3" and not std3_rules and replacement is None)):
|
||||
output += char
|
||||
elif replacement is not None and (status == "M" or
|
||||
(status == "3" and not std3_rules) or
|
||||
(status == "D" and transitional)):
|
||||
output += replacement
|
||||
elif status != "I":
|
||||
raise IndexError()
|
||||
return unicodedata.normalize("NFC", output)
|
||||
except IndexError:
|
||||
raise InvalidCodepoint(
|
||||
"Codepoint {0} not allowed at position {1} in {2}".format(
|
||||
_unot(code_point), pos + 1, repr(domain)))
|
||||
|
||||
|
||||
def encode(s, strict=False, uts46=False, std3_rules=False, transitional=False):
|
||||
|
||||
if isinstance(s, (bytes, bytearray)):
|
||||
s = s.decode("ascii")
|
||||
if uts46:
|
||||
s = uts46_remap(s, std3_rules, transitional)
|
||||
trailing_dot = False
|
||||
result = []
|
||||
if strict:
|
||||
labels = s.split('.')
|
||||
else:
|
||||
labels = _unicode_dots_re.split(s)
|
||||
if not labels or labels == ['']:
|
||||
raise IDNAError('Empty domain')
|
||||
if labels[-1] == '':
|
||||
del labels[-1]
|
||||
trailing_dot = True
|
||||
for label in labels:
|
||||
s = alabel(label)
|
||||
if s:
|
||||
result.append(s)
|
||||
else:
|
||||
raise IDNAError('Empty label')
|
||||
if trailing_dot:
|
||||
result.append(b'')
|
||||
s = b'.'.join(result)
|
||||
if not valid_string_length(s, trailing_dot):
|
||||
raise IDNAError('Domain too long')
|
||||
return s
|
||||
|
||||
|
||||
def decode(s, strict=False, uts46=False, std3_rules=False):
|
||||
|
||||
if isinstance(s, (bytes, bytearray)):
|
||||
s = s.decode("ascii")
|
||||
if uts46:
|
||||
s = uts46_remap(s, std3_rules, False)
|
||||
trailing_dot = False
|
||||
result = []
|
||||
if not strict:
|
||||
labels = _unicode_dots_re.split(s)
|
||||
else:
|
||||
labels = s.split(u'.')
|
||||
if not labels or labels == ['']:
|
||||
raise IDNAError('Empty domain')
|
||||
if not labels[-1]:
|
||||
del labels[-1]
|
||||
trailing_dot = True
|
||||
for label in labels:
|
||||
s = ulabel(label)
|
||||
if s:
|
||||
result.append(s)
|
||||
else:
|
||||
raise IDNAError('Empty label')
|
||||
if trailing_dot:
|
||||
result.append(u'')
|
||||
return u'.'.join(result)
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,53 +0,0 @@
|
|||
"""
|
||||
Given a list of integers, made up of (hopefully) a small number of long runs
|
||||
of consecutive integers, compute a representation of the form
|
||||
((start1, end1), (start2, end2) ...). Then answer the question "was x present
|
||||
in the original list?" in time O(log(# runs)).
|
||||
"""
|
||||
|
||||
import bisect
|
||||
|
||||
def intranges_from_list(list_):
|
||||
"""Represent a list of integers as a sequence of ranges:
|
||||
((start_0, end_0), (start_1, end_1), ...), such that the original
|
||||
integers are exactly those x such that start_i <= x < end_i for some i.
|
||||
|
||||
Ranges are encoded as single integers (start << 32 | end), not as tuples.
|
||||
"""
|
||||
|
||||
sorted_list = sorted(list_)
|
||||
ranges = []
|
||||
last_write = -1
|
||||
for i in range(len(sorted_list)):
|
||||
if i+1 < len(sorted_list):
|
||||
if sorted_list[i] == sorted_list[i+1]-1:
|
||||
continue
|
||||
current_range = sorted_list[last_write+1:i+1]
|
||||
ranges.append(_encode_range(current_range[0], current_range[-1] + 1))
|
||||
last_write = i
|
||||
|
||||
return tuple(ranges)
|
||||
|
||||
def _encode_range(start, end):
|
||||
return (start << 32) | end
|
||||
|
||||
def _decode_range(r):
|
||||
return (r >> 32), (r & ((1 << 32) - 1))
|
||||
|
||||
|
||||
def intranges_contain(int_, ranges):
|
||||
"""Determine if `int_` falls into one of the ranges in `ranges`."""
|
||||
tuple_ = _encode_range(int_, 0)
|
||||
pos = bisect.bisect_left(ranges, tuple_)
|
||||
# we could be immediately ahead of a tuple (start, end)
|
||||
# with start < int_ <= end
|
||||
if pos > 0:
|
||||
left, right = _decode_range(ranges[pos-1])
|
||||
if left <= int_ < right:
|
||||
return True
|
||||
# or we could be immediately behind a tuple (int_, end)
|
||||
if pos < len(ranges):
|
||||
left, _ = _decode_range(ranges[pos])
|
||||
if left == int_:
|
||||
return True
|
||||
return False
|
|
@ -1,2 +0,0 @@
|
|||
__version__ = '2.10'
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,7 +0,0 @@
|
|||
[bdist_wheel]
|
||||
universal = 1
|
||||
|
||||
[egg_info]
|
||||
tag_build =
|
||||
tag_date = 0
|
||||
|
|
@ -1,59 +0,0 @@
|
|||
"""
|
||||
A library to support the Internationalised Domain Names in Applications
|
||||
(IDNA) protocol as specified in RFC 5890 et.al. This new methodology,
|
||||
known as IDNA 2008, can generate materially different results to the
|
||||
previous standard. The library can act as a drop-in replacement for
|
||||
the "encodings.idna" module.
|
||||
"""
|
||||
|
||||
import io, sys
|
||||
from setuptools import setup
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
python_version = sys.version_info[:2]
|
||||
if python_version < (2,7):
|
||||
raise SystemExit("Sorry, Python 2.7 or newer required")
|
||||
|
||||
package_data = {}
|
||||
exec(open('idna/package_data.py').read(), package_data)
|
||||
|
||||
arguments = {
|
||||
'name': 'idna',
|
||||
'packages': ['idna'],
|
||||
'version': package_data['__version__'],
|
||||
'description': 'Internationalized Domain Names in Applications (IDNA)',
|
||||
'long_description': io.open("README.rst", encoding="UTF-8").read(),
|
||||
'author': 'Kim Davies',
|
||||
'author_email': 'kim@cynosure.com.au',
|
||||
'license': 'BSD-like',
|
||||
'url': 'https://github.com/kjd/idna',
|
||||
'classifiers': [
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Intended Audience :: Developers',
|
||||
'Intended Audience :: System Administrators',
|
||||
'License :: OSI Approved :: BSD License',
|
||||
'Operating System :: OS Independent',
|
||||
'Programming Language :: Python',
|
||||
'Programming Language :: Python :: 2',
|
||||
'Programming Language :: Python :: 2.7',
|
||||
'Programming Language :: Python :: 3',
|
||||
'Programming Language :: Python :: 3.4',
|
||||
'Programming Language :: Python :: 3.5',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Programming Language :: Python :: Implementation :: CPython',
|
||||
'Programming Language :: Python :: Implementation :: PyPy',
|
||||
'Topic :: Internet :: Name Service (DNS)',
|
||||
'Topic :: Software Development :: Libraries :: Python Modules',
|
||||
'Topic :: Utilities',
|
||||
],
|
||||
'python_requires': '>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*',
|
||||
}
|
||||
|
||||
setup(**arguments)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -1,670 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import argparse, collections, datetime, os, re, sys, unicodedata
|
||||
from urllib.request import urlopen
|
||||
from intranges import intranges_from_list
|
||||
|
||||
if sys.version_info[0] < 3:
|
||||
print("Only Python 3 supported.")
|
||||
sys.exit(2)
|
||||
|
||||
PREFERRED_VERSION = '12.1.0'
|
||||
UCD_URL = 'http://www.unicode.org/Public/{version}/ucd/{filename}'
|
||||
UTS46_URL = 'http://www.unicode.org/Public/idna/{version}/{filename}'
|
||||
|
||||
DEFAULT_CACHE_DIR = '~/.cache/unidata'
|
||||
|
||||
# Scripts affected by IDNA contextual rules
|
||||
SCRIPT_WHITELIST = sorted(['Greek', 'Han', 'Hebrew', 'Hiragana', 'Katakana'])
|
||||
|
||||
# Used to piece apart UTS#46 data for Jython compatibility
|
||||
UTS46_SEGMENT_SIZE = 100
|
||||
|
||||
UTS46_STATUSES = {
|
||||
"valid": ("V", False),
|
||||
"ignored": ("I", False),
|
||||
"mapped": ("M", True),
|
||||
"deviation": ("D", True),
|
||||
"disallowed": ("X", False),
|
||||
"disallowed_STD3_valid": ("3", False),
|
||||
"disallowed_STD3_mapped": ("3", True)
|
||||
}
|
||||
|
||||
# Exceptions are manually assigned in Section 2.6 of RFC 5892.
|
||||
exceptions = {
|
||||
0x00DF: 'PVALID', # LATIN SMALL LETTER SHARP S
|
||||
0x03C2: 'PVALID', # GREEK SMALL LETTER FINAL SIGMA
|
||||
0x06FD: 'PVALID', # ARABIC SIGN SINDHI AMPERSAND
|
||||
0x06FE: 'PVALID', # ARABIC SIGN SINDHI POSTPOSITION MEN
|
||||
0x0F0B: 'PVALID', # TIBETAN MARK INTERSYLLABIC TSHEG
|
||||
0x3007: 'PVALID', # IDEOGRAPHIC NUMBER ZERO
|
||||
0x00B7: 'CONTEXTO', # MIDDLE DOT
|
||||
0x0375: 'CONTEXTO', # GREEK LOWER NUMERAL SIGN (KERAIA)
|
||||
0x05F3: 'CONTEXTO', # HEBREW PUNCTUATION GERESH
|
||||
0x05F4: 'CONTEXTO', # HEBREW PUNCTUATION GERSHAYIM
|
||||
0x30FB: 'CONTEXTO', # KATAKANA MIDDLE DOT
|
||||
0x0660: 'CONTEXTO', # ARABIC-INDIC DIGIT ZERO
|
||||
0x0661: 'CONTEXTO', # ARABIC-INDIC DIGIT ONE
|
||||
0x0662: 'CONTEXTO', # ARABIC-INDIC DIGIT TWO
|
||||
0x0663: 'CONTEXTO', # ARABIC-INDIC DIGIT THREE
|
||||
0x0664: 'CONTEXTO', # ARABIC-INDIC DIGIT FOUR
|
||||
0x0665: 'CONTEXTO', # ARABIC-INDIC DIGIT FIVE
|
||||
0x0666: 'CONTEXTO', # ARABIC-INDIC DIGIT SIX
|
||||
0x0667: 'CONTEXTO', # ARABIC-INDIC DIGIT SEVEN
|
||||
0x0668: 'CONTEXTO', # ARABIC-INDIC DIGIT EIGHT
|
||||
0x0669: 'CONTEXTO', # ARABIC-INDIC DIGIT NINE
|
||||
0x06F0: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT ZERO
|
||||
0x06F1: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT ONE
|
||||
0x06F2: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT TWO
|
||||
0x06F3: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT THREE
|
||||
0x06F4: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT FOUR
|
||||
0x06F5: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT FIVE
|
||||
0x06F6: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT SIX
|
||||
0x06F7: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT SEVEN
|
||||
0x06F8: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT EIGHT
|
||||
0x06F9: 'CONTEXTO', # EXTENDED ARABIC-INDIC DIGIT NINE
|
||||
0x0640: 'DISALLOWED', # ARABIC TATWEEL
|
||||
0x07FA: 'DISALLOWED', # NKO LAJANYALAN
|
||||
0x302E: 'DISALLOWED', # HANGUL SINGLE DOT TONE MARK
|
||||
0x302F: 'DISALLOWED', # HANGUL DOUBLE DOT TONE MARK
|
||||
0x3031: 'DISALLOWED', # VERTICAL KANA REPEAT MARK
|
||||
0x3032: 'DISALLOWED', # VERTICAL KANA REPEAT WITH VOICED SOUND MARK
|
||||
0x3033: 'DISALLOWED', # VERTICAL KANA REPEAT MARK UPPER HALF
|
||||
0x3034: 'DISALLOWED', # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HA
|
||||
0x3035: 'DISALLOWED', # VERTICAL KANA REPEAT MARK LOWER HALF
|
||||
0x303B: 'DISALLOWED', # VERTICAL IDEOGRAPHIC ITERATION MARK
|
||||
}
|
||||
backwardscompatible = {}
|
||||
|
||||
|
||||
def hexrange(start, end):
|
||||
return range(int(start, 16), int(end, 16) + 1)
|
||||
|
||||
def hexvalue(value):
|
||||
return int(value, 16)
|
||||
|
||||
|
||||
class UnicodeVersion(object):
|
||||
|
||||
def __init__(self, version):
|
||||
result = re.match('^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)$', version)
|
||||
if result:
|
||||
self.major = int(result.group('major'))
|
||||
self.minor = int(result.group('minor'))
|
||||
self.patch = int(result.group('patch'))
|
||||
self.numerical = (self.major << 8) + (self.minor << 4) + self.patch
|
||||
self.latest = False
|
||||
elif version == 'latest':
|
||||
self.latest = True
|
||||
else:
|
||||
raise ValueError('Unrecognized Unicode version')
|
||||
|
||||
def __repr__(self, with_date=True):
|
||||
if self.latest:
|
||||
if with_date:
|
||||
return 'latest@{}'.format(datetime.datetime.now().strftime('%Y-%m-%d'))
|
||||
else:
|
||||
return 'latest'
|
||||
else:
|
||||
return "{}.{}.{}".format(self.major, self.minor, self.patch)
|
||||
|
||||
@property
|
||||
def tag(self):
|
||||
return self.__repr__(with_date=False)
|
||||
|
||||
def __gt__(self, other):
|
||||
if self.latest:
|
||||
return True
|
||||
return self.numerical > other.numerical
|
||||
|
||||
def __eq__(self, other):
|
||||
if self.latest:
|
||||
return False
|
||||
return self.numerical == other.numerical
|
||||
|
||||
|
||||
class UnicodeData(object):
|
||||
|
||||
def __init__(self, version, cache, args):
|
||||
self.version = UnicodeVersion(version)
|
||||
self.system_version = UnicodeVersion(unicodedata.unidata_version)
|
||||
self.source = args.source
|
||||
self.cache = cache
|
||||
self.max = 0
|
||||
|
||||
if self.system_version < self.version:
|
||||
print("Warning: Character stability not guaranteed as Python Unicode data {}"
|
||||
" older than requested {}".format(self.system_version, self.version))
|
||||
|
||||
self._load_unicodedata()
|
||||
self._load_proplist()
|
||||
self._load_derivedcoreprops()
|
||||
self._load_blocks()
|
||||
self._load_casefolding()
|
||||
self._load_hangulst()
|
||||
self._load_arabicshaping()
|
||||
self._load_scripts()
|
||||
self._load_uts46mapping()
|
||||
|
||||
def _load_unicodedata(self):
|
||||
|
||||
f_ud = self._ucdfile('UnicodeData.txt')
|
||||
self.ucd_data = {}
|
||||
range_begin = None
|
||||
for line in f_ud.splitlines():
|
||||
fields = line.split(';')
|
||||
value = int(fields[0], 16)
|
||||
start_marker = re.match('^<(?P<name>.*?), First>$', fields[1])
|
||||
end_marker = re.match('^<(?P<name>.*?), Last>$', fields[1])
|
||||
if start_marker:
|
||||
range_begin = value
|
||||
elif end_marker:
|
||||
for i in range(range_begin, value+1):
|
||||
fields[1] = '<{}>'.format(end_marker.group('name'))
|
||||
self.ucd_data[i] = fields[1:]
|
||||
range_begin = None
|
||||
else:
|
||||
self.ucd_data[value] = fields[1:]
|
||||
|
||||
def _load_proplist(self):
|
||||
|
||||
f_pl = self._ucdfile('PropList.txt')
|
||||
self.ucd_props = collections.defaultdict(list)
|
||||
for line in f_pl.splitlines():
|
||||
result = re.match(
|
||||
'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$',
|
||||
line)
|
||||
if result:
|
||||
if result.group('end'):
|
||||
for i in hexrange(result.group('start'), result.group('end')):
|
||||
self.ucd_props[i].append(result.group('prop'))
|
||||
else:
|
||||
i = hexvalue(result.group('start'))
|
||||
self.ucd_props[i].append(result.group('prop'))
|
||||
|
||||
def _load_derivedcoreprops(self):
|
||||
|
||||
f_dcp = self._ucdfile('DerivedCoreProperties.txt')
|
||||
for line in f_dcp.splitlines():
|
||||
result = re.match(
|
||||
'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$',
|
||||
line)
|
||||
if result:
|
||||
if result.group('end'):
|
||||
for i in hexrange(result.group('start'), result.group('end')):
|
||||
self.ucd_props[i].append(result.group('prop'))
|
||||
else:
|
||||
i = hexvalue(result.group('start'))
|
||||
self.ucd_props[i].append(result.group('prop'))
|
||||
|
||||
def _load_blocks(self):
|
||||
|
||||
self.ucd_block = {}
|
||||
f_b = self._ucdfile('Blocks.txt')
|
||||
for line in f_b.splitlines():
|
||||
result = re.match(
|
||||
'^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<block>.*)\s*$',
|
||||
line)
|
||||
if result:
|
||||
for i in hexrange(result.group('start'), result.group('end')):
|
||||
self.ucd_block[i] = result.group('block')
|
||||
self.max = max(self.max, i)
|
||||
|
||||
def _load_casefolding(self):
|
||||
|
||||
self.ucd_cf = {}
|
||||
f_cf = self._ucdfile('CaseFolding.txt')
|
||||
for line in f_cf.splitlines():
|
||||
result = re.match(
|
||||
'^(?P<cp>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*;\s*(?P<subst>[0-9A-F\s]+)\s*',
|
||||
line)
|
||||
if result:
|
||||
if result.group('type') in ('C', 'F'):
|
||||
self.ucd_cf[int(result.group('cp'), 16)] = \
|
||||
''.join([chr(int(x, 16)) for x in result.group('subst').split(' ')])
|
||||
|
||||
def _load_hangulst(self):
|
||||
|
||||
self.ucd_hst = {}
|
||||
f_hst = self._ucdfile('HangulSyllableType.txt')
|
||||
for line in f_hst.splitlines():
|
||||
result = re.match(
|
||||
'^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*(|\#.*)$',
|
||||
line)
|
||||
if result:
|
||||
for i in hexrange(result.group('start'), result.group('end')):
|
||||
self.ucd_hst[i] = result.group('type')
|
||||
|
||||
def _load_arabicshaping(self):
|
||||
|
||||
self.ucd_as = {}
|
||||
f_as = self._ucdfile('ArabicShaping.txt')
|
||||
for line in f_as.splitlines():
|
||||
result = re.match('^(?P<cp>[0-9A-F]{4,6})\s*;\s*.*?\s*;\s*(?P<jt>\S+)\s*;', line)
|
||||
if result:
|
||||
self.ucd_as[int(result.group('cp'), 16)] = result.group('jt')
|
||||
|
||||
def _load_scripts(self):
|
||||
|
||||
self.ucd_s = {}
|
||||
f_s = self._ucdfile('Scripts.txt')
|
||||
for line in f_s.splitlines():
|
||||
result = re.match(
|
||||
'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<script>\S+)\s*(|\#.*)$',
|
||||
line)
|
||||
if result:
|
||||
if not result.group('script') in self.ucd_s:
|
||||
self.ucd_s[result.group('script')] = set()
|
||||
if result.group('end'):
|
||||
for i in hexrange(result.group('start'), result.group('end')):
|
||||
self.ucd_s[result.group('script')].add(i)
|
||||
else:
|
||||
i = hexvalue(result.group('start'))
|
||||
self.ucd_s[result.group('script')].add(i)
|
||||
|
||||
def _load_uts46mapping(self):
|
||||
|
||||
self.ucd_idnamt = {}
|
||||
f_idnamt = self._ucdfile('IdnaMappingTable.txt', urlbase=UTS46_URL)
|
||||
for line in f_idnamt.splitlines():
|
||||
result = re.match(
|
||||
'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<fields>[^#]+)',
|
||||
line)
|
||||
if result:
|
||||
fields = [x.strip() for x in result.group('fields').split(';')]
|
||||
if result.group('end'):
|
||||
for i in hexrange(result.group('start'), result.group('end')):
|
||||
self.ucd_idnamt[i] = fields
|
||||
else:
|
||||
i = hexvalue(result.group('start'))
|
||||
self.ucd_idnamt[i] = fields
|
||||
|
||||
def _ucdfile(self, filename, urlbase=UCD_URL):
|
||||
if self.source:
|
||||
f = open("{}/{}".format(self.source, filename))
|
||||
return f.read()
|
||||
else:
|
||||
cache_file = None
|
||||
if self.cache:
|
||||
cache_file = os.path.expanduser("{}/{}/{}".format(
|
||||
self.cache, self.version.tag, filename))
|
||||
if os.path.isfile(cache_file):
|
||||
f = open(cache_file)
|
||||
return f.read()
|
||||
|
||||
version_path = self.version.tag
|
||||
if version_path == 'latest':
|
||||
version_path = 'UCD/latest'
|
||||
url = urlbase.format(
|
||||
version=version_path,
|
||||
filename=filename,
|
||||
)
|
||||
content = urlopen(url).read().decode('utf-8')
|
||||
|
||||
if cache_file:
|
||||
if not os.path.isdir(os.path.dirname(cache_file)):
|
||||
os.makedirs(os.path.dirname(cache_file))
|
||||
f = open(cache_file, 'wb')
|
||||
f.write(content.encode('utf-8'))
|
||||
f.close()
|
||||
|
||||
return str(content)
|
||||
|
||||
def codepoints(self):
|
||||
for i in range(0, self.max + 1):
|
||||
yield CodePoint(i, ucdata=self)
|
||||
|
||||
|
||||
class CodePoint:
|
||||
|
||||
def __init__(self, value=None, ucdata=None):
|
||||
self.value = value
|
||||
self.ucdata = ucdata
|
||||
|
||||
def _casefold(self, s):
|
||||
r = ''
|
||||
for c in s:
|
||||
r += self.ucdata.ucd_cf.get(ord(c), c)
|
||||
return r
|
||||
|
||||
@property
|
||||
def exception_value(self):
|
||||
return exceptions.get(self.value, False)
|
||||
|
||||
@property
|
||||
def compat_value(self):
|
||||
return backwardscompatible.get(self.value, False)
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
if self.value in self.ucdata.ucd_data:
|
||||
return self.ucdata.ucd_data[self.value][0]
|
||||
elif 'Noncharacter_Code_Point' in self.ucdata.ucd_props[self.value]:
|
||||
return '<noncharacter>'
|
||||
else:
|
||||
return '<reserved>'
|
||||
|
||||
@property
|
||||
def general_category(self):
|
||||
return self.ucdata.ucd_data.get(self.value, [None, None])[1]
|
||||
|
||||
@property
|
||||
def unassigned(self):
|
||||
return not ('Noncharacter_Code_Point' in self.ucdata.ucd_props[self.value] or \
|
||||
self.value in self.ucdata.ucd_data)
|
||||
|
||||
@property
|
||||
def ldh(self):
|
||||
if self.value == 0x002d or \
|
||||
self.value in range(0x0030, 0x0039+1) or \
|
||||
self.value in range(0x0061, 0x007a+1):
|
||||
return True
|
||||
return False
|
||||
|
||||
@property
|
||||
def join_control(self):
|
||||
return 'Join_Control' in self.ucdata.ucd_props[self.value]
|
||||
|
||||
@property
|
||||
def joining_type(self):
|
||||
return self.ucdata.ucd_as.get(self.value, None)
|
||||
|
||||
@property
|
||||
def char(self):
|
||||
return chr(self.value)
|
||||
|
||||
@property
|
||||
def nfkc_cf(self):
|
||||
return unicodedata.normalize('NFKC',
|
||||
self._casefold(unicodedata.normalize('NFKC', self.char)))
|
||||
|
||||
@property
|
||||
def unstable(self):
|
||||
return self.char != self.nfkc_cf
|
||||
|
||||
@property
|
||||
def in_ignorableproperties(self):
|
||||
for prop in ['Default_Ignorable_Code_Point', 'White_Space', 'Noncharacter_Code_Point']:
|
||||
if prop in self.ucdata.ucd_props[self.value]:
|
||||
return True
|
||||
return False
|
||||
|
||||
@property
|
||||
def in_ignorableblocks(self):
|
||||
return self.ucdata.ucd_block.get(self.value) in (
|
||||
'Combining Diacritical Marks for Symbols', 'Musical Symbols',
|
||||
'Ancient Greek Musical Notation'
|
||||
)
|
||||
|
||||
@property
|
||||
def oldhanguljamo(self):
|
||||
return self.ucdata.ucd_hst.get(self.value) in ('L', 'V', 'T')
|
||||
|
||||
@property
|
||||
def in_lettersdigits(self):
|
||||
return self.general_category in ('Ll', 'Lu', 'Lo', 'Nd', 'Lm', 'Mn', 'Mc')
|
||||
|
||||
@property
|
||||
def idna2008_status(self):
|
||||
if self.exception_value:
|
||||
return self.exception_value
|
||||
elif self.compat_value:
|
||||
return self.compat_value
|
||||
elif self.unassigned:
|
||||
return 'UNASSIGNED'
|
||||
elif self.ldh:
|
||||
return 'PVALID'
|
||||
elif self.join_control:
|
||||
return 'CONTEXTJ'
|
||||
elif self.unstable:
|
||||
return 'DISALLOWED'
|
||||
elif self.in_ignorableproperties:
|
||||
return 'DISALLOWED'
|
||||
elif self.in_ignorableblocks:
|
||||
return 'DISALLOWED'
|
||||
elif self.oldhanguljamo:
|
||||
return 'DISALLOWED'
|
||||
elif self.in_lettersdigits:
|
||||
return 'PVALID'
|
||||
else:
|
||||
return 'DISALLOWED'
|
||||
|
||||
@property
|
||||
def uts46_data(self):
|
||||
return self.ucdata.ucd_idnamt.get(self.value, None)
|
||||
|
||||
@property
|
||||
def uts46_status(self):
|
||||
return ' '.join(self.uts46_data)
|
||||
|
||||
|
||||
def diagnose_codepoint(codepoint, args, ucdata):
|
||||
|
||||
cp = CodePoint(codepoint, ucdata=ucdata)
|
||||
|
||||
print("U+{:04X}:".format(codepoint))
|
||||
print(" Name: {}".format(cp.name))
|
||||
print("1 Exceptions: {}".format(exceptions.get(codepoint, False)))
|
||||
print("2 Backwards Compat: {}".format(backwardscompatible.get(codepoint, False)))
|
||||
print("3 Unassigned: {}".format(cp.unassigned))
|
||||
print("4 LDH: {}".format(cp.ldh))
|
||||
print(" Properties: {}".format(" ".join(sorted(ucdata.ucd_props.get(codepoint, ['None'])))))
|
||||
print("5 .Join Control: {}".format(cp.join_control))
|
||||
print(" NFKC CF: {}".format(" ".join(["U+{:04X}".format(ord(x)) for x in cp.nfkc_cf])))
|
||||
print("6 .Unstable: {}".format(cp.unstable))
|
||||
print("7 .Ignorable Prop: {}".format(cp.in_ignorableproperties))
|
||||
print(" Block: {}".format(ucdata.ucd_block.get(codepoint, None)))
|
||||
print("8 .Ignorable Block: {}".format(cp.in_ignorableblocks))
|
||||
print(" Hangul Syll Type: {}".format(ucdata.ucd_hst.get(codepoint, None)))
|
||||
print("9 .Old Hangul Jamo: {}".format(cp.oldhanguljamo))
|
||||
print(" General Category: {}".format(cp.general_category))
|
||||
print("10 .Letters Digits: {}".format(cp.in_lettersdigits))
|
||||
print("== IDNA 2008: {}".format(cp.idna2008_status))
|
||||
print("== UTS 46: {}".format(cp.uts46_status))
|
||||
print("(Unicode {} [sys:{}])".format(ucdata.version, ucdata.system_version))
|
||||
|
||||
def ucdrange(start, end):
|
||||
if start == end:
|
||||
return ("{:04X}".format(start.value), start.name)
|
||||
else:
|
||||
return ("{:04X}..{:04X}".format(start.value, end.value),
|
||||
"{}..{}".format(start.name, end.name))
|
||||
|
||||
def optimised_list(d):
|
||||
yield '('
|
||||
for value in intranges_from_list(d):
|
||||
yield ' {},'.format(hex(value))
|
||||
yield ' ),'
|
||||
|
||||
def make_table(args, ucdata):
|
||||
|
||||
last_status = None
|
||||
cps = []
|
||||
table_data = []
|
||||
|
||||
for cp in ucdata.codepoints():
|
||||
status = cp.idna2008_status
|
||||
if (last_status and last_status != status):
|
||||
(values, description) = ucdrange(cps[0], cps[-1])
|
||||
table_data.append([values, last_status, description])
|
||||
cps = []
|
||||
last_status = status
|
||||
cps.append(cp)
|
||||
(values, description) = ucdrange(cps[0], cps[-1])
|
||||
table_data.append([values, last_status, description])
|
||||
|
||||
if args.dir:
|
||||
|
||||
f = open("{}/idna-table-{}.txt".format(args.dir, ucdata.version), 'wb')
|
||||
for row in table_data:
|
||||
f.write("{:12}; {:12}# {:.44}\n".format(*row).encode('ascii'))
|
||||
f.close()
|
||||
|
||||
else:
|
||||
|
||||
for row in table_data:
|
||||
print("{:12}; {:12}# {:.44}".format(*row))
|
||||
|
||||
def idna_libdata(ucdata):
|
||||
|
||||
yield "# This file is automatically generated by tools/idna-data\n"
|
||||
yield "__version__ = \"{}\"".format(ucdata.version)
|
||||
|
||||
#
|
||||
# Script classifications are used by some CONTEXTO rules in RFC 5891
|
||||
#
|
||||
yield "scripts = {"
|
||||
for script in SCRIPT_WHITELIST:
|
||||
prefix = " '{0}': ".format(script)
|
||||
for line in optimised_list(ucdata.ucd_s[script]):
|
||||
yield prefix + line
|
||||
prefix = ""
|
||||
yield "}"
|
||||
|
||||
#
|
||||
# Joining types are used by CONTEXTJ rule A.1
|
||||
#
|
||||
yield "joining_types = {"
|
||||
for cp in ucdata.codepoints():
|
||||
if cp.joining_type:
|
||||
yield " 0x{0:x}: {1},".format(cp.value, ord(cp.joining_type))
|
||||
yield "}"
|
||||
|
||||
#
|
||||
# These are the classification of codepoints into PVALID, CONTEXTO, CONTEXTJ, etc.
|
||||
#
|
||||
yield "codepoint_classes = {"
|
||||
classes = {}
|
||||
for cp in ucdata.codepoints():
|
||||
status = cp.idna2008_status
|
||||
if status in ('UNASSIGNED', 'DISALLOWED'):
|
||||
continue
|
||||
if not status in classes:
|
||||
classes[status] = set()
|
||||
classes[status].add(cp.value)
|
||||
for status in ['PVALID', 'CONTEXTJ', 'CONTEXTO']:
|
||||
prefix = " '{0}': ".format(status)
|
||||
for line in optimised_list(classes[status]):
|
||||
yield prefix + line
|
||||
prefix = ""
|
||||
yield "}"
|
||||
|
||||
def uts46_ranges(ucdata):
|
||||
|
||||
last = (None, None)
|
||||
for cp in ucdata.codepoints():
|
||||
fields = cp.uts46_data
|
||||
if not fields:
|
||||
continue
|
||||
status, mapping = UTS46_STATUSES[fields[0]]
|
||||
if mapping:
|
||||
mapping = "".join(chr(int(codepoint, 16)) for codepoint in fields[1].split())
|
||||
mapping = mapping.replace("\\", "\\\\").replace("'", "\\'")
|
||||
else:
|
||||
mapping = None
|
||||
if cp.value > 255 and (status, mapping) == last:
|
||||
continue
|
||||
last = (status, mapping)
|
||||
|
||||
if mapping is not None:
|
||||
yield "(0x{0:X}, '{1}', u'{2}')".format(cp.value, status, mapping)
|
||||
else:
|
||||
yield "(0x{0:X}, '{1}')".format(cp.value, status)
|
||||
|
||||
def uts46_libdata(ucdata):
|
||||
|
||||
yield "# This file is automatically generated by tools/idna-data"
|
||||
yield "# vim: set fileencoding=utf-8 :\n"
|
||||
yield '"""IDNA Mapping Table from UTS46."""\n\n'
|
||||
|
||||
yield "__version__ = \"{}\"".format(ucdata.version)
|
||||
|
||||
idx = -1
|
||||
for row in uts46_ranges(ucdata):
|
||||
idx += 1
|
||||
if idx % UTS46_SEGMENT_SIZE == 0:
|
||||
if idx != 0:
|
||||
yield " ]\n"
|
||||
yield "def _seg_{0}():\n return [".format(idx // UTS46_SEGMENT_SIZE)
|
||||
yield " {0},".format(row)
|
||||
yield " ]\n"
|
||||
|
||||
yield "uts46data = tuple("
|
||||
yield " _seg_0()"
|
||||
for i in range(1, idx // UTS46_SEGMENT_SIZE + 1):
|
||||
yield " + _seg_{0}()".format(i)
|
||||
yield ")"
|
||||
|
||||
def make_libdata(args, ucdata):
|
||||
|
||||
dest_dir = args.dir or '.'
|
||||
|
||||
target_filename = os.path.join(dest_dir, 'idnadata.py')
|
||||
with open(target_filename, 'wb') as target:
|
||||
for line in idna_libdata(ucdata):
|
||||
target.write((line + "\n").encode('utf-8'))
|
||||
|
||||
target_filename = os.path.join(dest_dir, 'uts46data.py')
|
||||
with open(target_filename, 'wb') as target:
|
||||
for line in uts46_libdata(ucdata):
|
||||
target.write((line + "\n").encode('utf-8'))
|
||||
|
||||
def arg_error(message, parser):
|
||||
|
||||
parser.print_usage()
|
||||
print('{}: error: {}'.format(sys.argv[0], message))
|
||||
sys.exit(2)
|
||||
|
||||
def main():
|
||||
|
||||
parser = argparse.ArgumentParser(description='Determine IDNA code-point validity data')
|
||||
parser.add_argument('action', type=str, default='preferred',
|
||||
help='Task to perform (make-libdata, make-tables, <codepoint>)')
|
||||
|
||||
parser.add_argument('--version', type=str, default='preferred',
|
||||
help='Unicode version to use (preferred, latest, <x.y.z>)')
|
||||
parser.add_argument('--source', type=str, default=None,
|
||||
help='Where to fetch Unicode data (file path)')
|
||||
parser.add_argument('--dir', type=str, default=None, help='Where to export the output')
|
||||
parser.add_argument('--cache', type=str, default=None, help='Where to cache Unicode data')
|
||||
parser.add_argument('--no-cache', action='store_true', help='Don\'t cache Unicode data')
|
||||
libdata = parser.add_argument_group('make-libdata', 'Make module data for Python IDNA library')
|
||||
|
||||
tables = parser.add_argument_group('make-table', 'Make IANA-style reference table')
|
||||
|
||||
codepoint = parser.add_argument_group('codepoint',
|
||||
'Display related data for given codepoint (e.g. U+0061)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.version == 'preferred':
|
||||
target_version = PREFERRED_VERSION
|
||||
else:
|
||||
target_version = args.version
|
||||
|
||||
if args.cache and args.no_cache:
|
||||
arg_error('I can\'t both --cache and --no-cache', parser)
|
||||
cache = args.cache or DEFAULT_CACHE_DIR
|
||||
if args.no_cache:
|
||||
cache = None
|
||||
|
||||
ucdata = UnicodeData(target_version, cache, args)
|
||||
|
||||
if args.action == 'make-table':
|
||||
make_table(args, ucdata)
|
||||
elif args.action == 'make-libdata':
|
||||
make_libdata(args, ucdata)
|
||||
else:
|
||||
result = re.match('^(?i)(U\+|)(?P<cp>[0-9A-F]{4,6})$', args.action)
|
||||
if result:
|
||||
codepoint = int(result.group('cp'), 16)
|
||||
diagnose_codepoint(codepoint, args, ucdata)
|
||||
sys.exit(0)
|
||||
arg_error('Don\'t recognize action or codepoint value', parser)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
|
||||
|
|
@ -1,53 +0,0 @@
|
|||
"""
|
||||
Given a list of integers, made up of (hopefully) a small number of long runs
|
||||
of consecutive integers, compute a representation of the form
|
||||
((start1, end1), (start2, end2) ...). Then answer the question "was x present
|
||||
in the original list?" in time O(log(# runs)).
|
||||
"""
|
||||
|
||||
import bisect
|
||||
|
||||
def intranges_from_list(list_):
|
||||
"""Represent a list of integers as a sequence of ranges:
|
||||
((start_0, end_0), (start_1, end_1), ...), such that the original
|
||||
integers are exactly those x such that start_i <= x < end_i for some i.
|
||||
|
||||
Ranges are encoded as single integers (start << 32 | end), not as tuples.
|
||||
"""
|
||||
|
||||
sorted_list = sorted(list_)
|
||||
ranges = []
|
||||
last_write = -1
|
||||
for i in range(len(sorted_list)):
|
||||
if i+1 < len(sorted_list):
|
||||
if sorted_list[i] == sorted_list[i+1]-1:
|
||||
continue
|
||||
current_range = sorted_list[last_write+1:i+1]
|
||||
ranges.append(_encode_range(current_range[0], current_range[-1] + 1))
|
||||
last_write = i
|
||||
|
||||
return tuple(ranges)
|
||||
|
||||
def _encode_range(start, end):
|
||||
return (start << 32) | end
|
||||
|
||||
def _decode_range(r):
|
||||
return (r >> 32), (r & ((1 << 32) - 1))
|
||||
|
||||
|
||||
def intranges_contain(int_, ranges):
|
||||
"""Determine if `int_` falls into one of the ranges in `ranges`."""
|
||||
tuple_ = _encode_range(int_, 0)
|
||||
pos = bisect.bisect_left(ranges, tuple_)
|
||||
# we could be immediately ahead of a tuple (start, end)
|
||||
# with start < int_ <= end
|
||||
if pos > 0:
|
||||
left, right = _decode_range(ranges[pos-1])
|
||||
if left <= int_ < right:
|
||||
return True
|
||||
# or we could be immediately behind a tuple (int_, end)
|
||||
if pos < len(ranges):
|
||||
left, _ = _decode_range(ranges[pos])
|
||||
if left == int_:
|
||||
return True
|
||||
return False
|
|
@ -1,10 +0,0 @@
|
|||
[run]
|
||||
omit =
|
||||
# leading `*/` for pytest-dev/pytest-cov#456
|
||||
*/.tox/*
|
||||
tests/*
|
||||
prepare/*
|
||||
*/_itertools.py
|
||||
|
||||
[report]
|
||||
show_missing = True
|
|
@ -1,15 +0,0 @@
|
|||
root = true
|
||||
|
||||
[*]
|
||||
charset = utf-8
|
||||
indent_style = tab
|
||||
indent_size = 4
|
||||
insert_final_newline = true
|
||||
end_of_line = lf
|
||||
|
||||
[*.py]
|
||||
indent_style = space
|
||||
|
||||
[*.{yml,yaml}]
|
||||
indent_style = space
|
||||
indent_size = 2
|
|
@ -1,10 +0,0 @@
|
|||
[flake8]
|
||||
max-line-length = 88
|
||||
|
||||
# jaraco/skeleton#34
|
||||
max-complexity = 10
|
||||
|
||||
extend-ignore =
|
||||
# Black creates whitespace before colon
|
||||
E203
|
||||
enable-extensions = U4
|
|
@ -1,27 +0,0 @@
|
|||
name: automerge
|
||||
on:
|
||||
pull_request:
|
||||
types:
|
||||
- labeled
|
||||
- unlabeled
|
||||
- synchronize
|
||||
- opened
|
||||
- edited
|
||||
- ready_for_review
|
||||
- reopened
|
||||
- unlocked
|
||||
pull_request_review:
|
||||
types:
|
||||
- submitted
|
||||
check_suite:
|
||||
types:
|
||||
- completed
|
||||
status: {}
|
||||
jobs:
|
||||
automerge:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: automerge
|
||||
uses: "pascalgn/automerge-action@v0.12.0"
|
||||
env:
|
||||
GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
|
|
@ -1,76 +0,0 @@
|
|||
name: tests
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
test:
|
||||
strategy:
|
||||
matrix:
|
||||
python: [3.6, 3.8, 3.9]
|
||||
platform: [ubuntu-latest, macos-latest, windows-latest]
|
||||
runs-on: ${{ matrix.platform }}
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: ${{ matrix.python }}
|
||||
- name: Install tox
|
||||
run: |
|
||||
python -m pip install tox
|
||||
- name: Run tests
|
||||
run: tox
|
||||
|
||||
benchmark:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.9
|
||||
- name: Install tox
|
||||
run: |
|
||||
python -m pip install tox
|
||||
- name: Run benchmarks
|
||||
run: tox
|
||||
env:
|
||||
TOXENV: perf{,-ref}
|
||||
|
||||
diffcov:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.9
|
||||
- name: Install tox
|
||||
run: |
|
||||
python -m pip install tox
|
||||
- name: Evaluate coverage
|
||||
run: tox
|
||||
env:
|
||||
TOXENV: diffcov
|
||||
|
||||
release:
|
||||
needs: test
|
||||
if: github.event_name == 'push' && contains(github.ref, 'refs/tags/')
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.9
|
||||
- name: Install tox
|
||||
run: |
|
||||
python -m pip install tox
|
||||
- name: Release
|
||||
run: tox -e release
|
||||
env:
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
@ -1,13 +0,0 @@
|
|||
build
|
||||
/coverage.xml
|
||||
/diffcov.html
|
||||
htmlcov
|
||||
importlib_metadata.egg-info
|
||||
.mypy_cache
|
||||
/.coverage
|
||||
/.DS_Store
|
||||
artifacts
|
||||
.eggs
|
||||
.doctrees
|
||||
dist
|
||||
pip-wheel-metadata
|
|
@ -0,0 +1,50 @@
|
|||
image: quay.io/python-devs/ci-image
|
||||
|
||||
stages:
|
||||
- test
|
||||
- qa
|
||||
- docs
|
||||
- codecov
|
||||
- deploy
|
||||
|
||||
qa:
|
||||
script:
|
||||
- tox -e qa
|
||||
|
||||
tests:
|
||||
script:
|
||||
- tox -e py27,py35,py36,py37,py38
|
||||
|
||||
coverage:
|
||||
script:
|
||||
- tox -e py27-cov,py35-cov,py36-cov,py37-cov,py38-cov
|
||||
artifacts:
|
||||
paths:
|
||||
- coverage.xml
|
||||
|
||||
benchmark:
|
||||
script:
|
||||
- tox -e perf
|
||||
|
||||
diffcov:
|
||||
script:
|
||||
- tox -e py27-diffcov,py35-diffcov,py36-diffcov,py37-diffcov,py38-diffcov
|
||||
|
||||
docs:
|
||||
script:
|
||||
- tox -e docs
|
||||
|
||||
codecov:
|
||||
stage: codecov
|
||||
dependencies:
|
||||
- coverage
|
||||
script:
|
||||
- codecov
|
||||
when: on_success
|
||||
|
||||
release:
|
||||
stage: deploy
|
||||
only:
|
||||
- /^v\d+\.\d+(\.\d+)?([abc]\d*)?$/
|
||||
script:
|
||||
- tox -e release
|
|
@ -1,10 +0,0 @@
|
|||
repos:
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 20.8b1
|
||||
hooks:
|
||||
- id: black
|
||||
|
||||
- repo: https://github.com/asottile/blacken-docs
|
||||
rev: v1.9.1
|
||||
hooks:
|
||||
- id: blacken-docs
|
|
@ -1,6 +1,5 @@
|
|||
version: 2
|
||||
python:
|
||||
install:
|
||||
- path: .
|
||||
extra_requirements:
|
||||
- docs
|
||||
version: 3
|
||||
extra_requirements:
|
||||
- docs
|
||||
pip_install: true
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
include *.py MANIFEST.in LICENSE README.rst
|
||||
global-include *.txt *.rst *.ini *.cfg *.toml *.whl *.egg
|
||||
exclude .gitignore
|
||||
prune build
|
||||
prune .tox
|
|
@ -1,38 +1,17 @@
|
|||
Metadata-Version: 2.1
|
||||
Name: importlib_metadata
|
||||
Version: 3.10.1
|
||||
Version: 1.7.0
|
||||
Summary: Read metadata from Python packages
|
||||
Home-page: https://github.com/python/importlib_metadata
|
||||
Author: Jason R. Coombs
|
||||
Author-email: jaraco@jaraco.com
|
||||
License: UNKNOWN
|
||||
Description: .. image:: https://img.shields.io/pypi/v/importlib_metadata.svg
|
||||
:target: `PyPI link`_
|
||||
Home-page: http://importlib-metadata.readthedocs.io/
|
||||
Author: Barry Warsaw
|
||||
Author-email: barry@python.org
|
||||
License: Apache Software License
|
||||
Description: =========================
|
||||
``importlib_metadata``
|
||||
=========================
|
||||
|
||||
.. image:: https://img.shields.io/pypi/pyversions/importlib_metadata.svg
|
||||
:target: `PyPI link`_
|
||||
|
||||
.. _PyPI link: https://pypi.org/project/importlib_metadata
|
||||
|
||||
.. image:: https://github.com/python/importlib_metadata/workflows/tests/badge.svg
|
||||
:target: https://github.com/python/importlib_metadata/actions?query=workflow%3A%22tests%22
|
||||
:alt: tests
|
||||
|
||||
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
|
||||
:target: https://github.com/psf/black
|
||||
:alt: Code style: Black
|
||||
|
||||
.. image:: https://readthedocs.org/projects/importlib-metadata/badge/?version=latest
|
||||
:target: https://importlib-metadata.readthedocs.io/en/latest/?badge=latest
|
||||
|
||||
|
||||
Library to access the metadata for a Python package.
|
||||
|
||||
As of Python 3.8, this functionality has been added to the
|
||||
`Python standard library
|
||||
<https://docs.python.org/3/library/importlib.metadata.html>`_.
|
||||
This package supplies backports of that functionality including
|
||||
improvements added to subsequent Python versions.
|
||||
``importlib_metadata`` is a library to access the metadata for a Python
|
||||
package. It is intended to be ported to Python 3.8.
|
||||
|
||||
|
||||
Usage
|
||||
|
@ -59,17 +38,18 @@ Description: .. image:: https://img.shields.io/pypi/v/importlib_metadata.svg
|
|||
Project details
|
||||
===============
|
||||
|
||||
* Project home: https://github.com/python/importlib_metadata
|
||||
* Report bugs at: https://github.com/python/importlib_metadata/issues
|
||||
* Code hosting: https://github.com/python/importlib_metadata
|
||||
* Documentation: https://importlib_metadata.readthedocs.io/
|
||||
* Project home: https://gitlab.com/python-devs/importlib_metadata
|
||||
* Report bugs at: https://gitlab.com/python-devs/importlib_metadata/issues
|
||||
* Code hosting: https://gitlab.com/python-devs/importlib_metadata.git
|
||||
* Documentation: http://importlib_metadata.readthedocs.io/
|
||||
|
||||
Platform: UNKNOWN
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Development Status :: 3 - Alpha
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: Apache Software License
|
||||
Classifier: Topic :: Software Development :: Libraries
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Requires-Python: >=3.6
|
||||
Classifier: Programming Language :: Python :: 2
|
||||
Requires-Python: !=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7
|
||||
Provides-Extra: testing
|
||||
Provides-Extra: docs
|
||||
|
|
|
@ -1,30 +1,9 @@
|
|||
.. image:: https://img.shields.io/pypi/v/importlib_metadata.svg
|
||||
:target: `PyPI link`_
|
||||
=========================
|
||||
``importlib_metadata``
|
||||
=========================
|
||||
|
||||
.. image:: https://img.shields.io/pypi/pyversions/importlib_metadata.svg
|
||||
:target: `PyPI link`_
|
||||
|
||||
.. _PyPI link: https://pypi.org/project/importlib_metadata
|
||||
|
||||
.. image:: https://github.com/python/importlib_metadata/workflows/tests/badge.svg
|
||||
:target: https://github.com/python/importlib_metadata/actions?query=workflow%3A%22tests%22
|
||||
:alt: tests
|
||||
|
||||
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
|
||||
:target: https://github.com/psf/black
|
||||
:alt: Code style: Black
|
||||
|
||||
.. image:: https://readthedocs.org/projects/importlib-metadata/badge/?version=latest
|
||||
:target: https://importlib-metadata.readthedocs.io/en/latest/?badge=latest
|
||||
|
||||
|
||||
Library to access the metadata for a Python package.
|
||||
|
||||
As of Python 3.8, this functionality has been added to the
|
||||
`Python standard library
|
||||
<https://docs.python.org/3/library/importlib.metadata.html>`_.
|
||||
This package supplies backports of that functionality including
|
||||
improvements added to subsequent Python versions.
|
||||
``importlib_metadata`` is a library to access the metadata for a Python
|
||||
package. It is intended to be ported to Python 3.8.
|
||||
|
||||
|
||||
Usage
|
||||
|
@ -51,7 +30,7 @@ tools (or other conforming packages). It does not support:
|
|||
Project details
|
||||
===============
|
||||
|
||||
* Project home: https://github.com/python/importlib_metadata
|
||||
* Report bugs at: https://github.com/python/importlib_metadata/issues
|
||||
* Code hosting: https://github.com/python/importlib_metadata
|
||||
* Documentation: https://importlib_metadata.readthedocs.io/
|
||||
* Project home: https://gitlab.com/python-devs/importlib_metadata
|
||||
* Report bugs at: https://gitlab.com/python-devs/importlib_metadata/issues
|
||||
* Code hosting: https://gitlab.com/python-devs/importlib_metadata.git
|
||||
* Documentation: http://importlib_metadata.readthedocs.io/
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
codecov:
|
||||
token: 5eb1bc45-1b7f-43e6-8bc1-f2b02833dba9
|
|
@ -1,4 +0,0 @@
|
|||
collect_ignore = [
|
||||
# this module fails mypy tests because 'setup.py' matches './setup.py'
|
||||
'prepare/example/setup.py',
|
||||
]
|
|
@ -0,0 +1,24 @@
|
|||
[run]
|
||||
branch = true
|
||||
parallel = true
|
||||
omit =
|
||||
setup*
|
||||
.tox/*/lib/python*
|
||||
*/tests/*.py
|
||||
*/testing/*.py
|
||||
/usr/local/*
|
||||
*/mod.py
|
||||
plugins =
|
||||
coverplug
|
||||
|
||||
[report]
|
||||
exclude_lines =
|
||||
pragma: nocover
|
||||
raise NotImplementedError
|
||||
raise AssertionError
|
||||
assert\s
|
||||
nocoverpy${PYV}
|
||||
|
||||
[paths]
|
||||
source =
|
||||
importlib_metadata
|
|
@ -0,0 +1,21 @@
|
|||
"""Coverage plugin to add exclude lines based on the Python version."""
|
||||
|
||||
import sys
|
||||
|
||||
from coverage import CoveragePlugin
|
||||
|
||||
|
||||
class MyConfigPlugin(CoveragePlugin):
|
||||
def configure(self, config):
|
||||
opt_name = 'report:exclude_lines'
|
||||
exclude_lines = config.get_option(opt_name)
|
||||
# Python >= 3.6 has os.PathLike.
|
||||
if sys.version_info >= (3, 6):
|
||||
exclude_lines.append('pragma: >=36')
|
||||
else:
|
||||
exclude_lines.append('pragma: <=35')
|
||||
config.set_option(opt_name, exclude_lines)
|
||||
|
||||
|
||||
def coverage_init(reg, options):
|
||||
reg.add_configurer(MyConfigPlugin())
|
|
@ -1,38 +1,17 @@
|
|||
Metadata-Version: 2.1
|
||||
Name: importlib-metadata
|
||||
Version: 3.10.1
|
||||
Version: 1.7.0
|
||||
Summary: Read metadata from Python packages
|
||||
Home-page: https://github.com/python/importlib_metadata
|
||||
Author: Jason R. Coombs
|
||||
Author-email: jaraco@jaraco.com
|
||||
License: UNKNOWN
|
||||
Description: .. image:: https://img.shields.io/pypi/v/importlib_metadata.svg
|
||||
:target: `PyPI link`_
|
||||
Home-page: http://importlib-metadata.readthedocs.io/
|
||||
Author: Barry Warsaw
|
||||
Author-email: barry@python.org
|
||||
License: Apache Software License
|
||||
Description: =========================
|
||||
``importlib_metadata``
|
||||
=========================
|
||||
|
||||
.. image:: https://img.shields.io/pypi/pyversions/importlib_metadata.svg
|
||||
:target: `PyPI link`_
|
||||
|
||||
.. _PyPI link: https://pypi.org/project/importlib_metadata
|
||||
|
||||
.. image:: https://github.com/python/importlib_metadata/workflows/tests/badge.svg
|
||||
:target: https://github.com/python/importlib_metadata/actions?query=workflow%3A%22tests%22
|
||||
:alt: tests
|
||||
|
||||
.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
|
||||
:target: https://github.com/psf/black
|
||||
:alt: Code style: Black
|
||||
|
||||
.. image:: https://readthedocs.org/projects/importlib-metadata/badge/?version=latest
|
||||
:target: https://importlib-metadata.readthedocs.io/en/latest/?badge=latest
|
||||
|
||||
|
||||
Library to access the metadata for a Python package.
|
||||
|
||||
As of Python 3.8, this functionality has been added to the
|
||||
`Python standard library
|
||||
<https://docs.python.org/3/library/importlib.metadata.html>`_.
|
||||
This package supplies backports of that functionality including
|
||||
improvements added to subsequent Python versions.
|
||||
``importlib_metadata`` is a library to access the metadata for a Python
|
||||
package. It is intended to be ported to Python 3.8.
|
||||
|
||||
|
||||
Usage
|
||||
|
@ -59,17 +38,18 @@ Description: .. image:: https://img.shields.io/pypi/v/importlib_metadata.svg
|
|||
Project details
|
||||
===============
|
||||
|
||||
* Project home: https://github.com/python/importlib_metadata
|
||||
* Report bugs at: https://github.com/python/importlib_metadata/issues
|
||||
* Code hosting: https://github.com/python/importlib_metadata
|
||||
* Documentation: https://importlib_metadata.readthedocs.io/
|
||||
* Project home: https://gitlab.com/python-devs/importlib_metadata
|
||||
* Report bugs at: https://gitlab.com/python-devs/importlib_metadata/issues
|
||||
* Code hosting: https://gitlab.com/python-devs/importlib_metadata.git
|
||||
* Documentation: http://importlib_metadata.readthedocs.io/
|
||||
|
||||
Platform: UNKNOWN
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Development Status :: 3 - Alpha
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: Apache Software License
|
||||
Classifier: Topic :: Software Development :: Libraries
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Requires-Python: >=3.6
|
||||
Classifier: Programming Language :: Python :: 2
|
||||
Requires-Python: !=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7
|
||||
Provides-Extra: testing
|
||||
Provides-Extra: docs
|
||||
|
|
|
@ -1,47 +1,35 @@
|
|||
.coveragerc
|
||||
.editorconfig
|
||||
.flake8
|
||||
.gitignore
|
||||
.pre-commit-config.yaml
|
||||
.gitlab-ci.yml
|
||||
.readthedocs.yml
|
||||
CHANGES.rst
|
||||
LICENSE
|
||||
MANIFEST.in
|
||||
README.rst
|
||||
conftest.py
|
||||
mypy.ini
|
||||
codecov.yml
|
||||
coverage.ini
|
||||
coverplug.py
|
||||
pyproject.toml
|
||||
pytest.ini
|
||||
setup.cfg
|
||||
setup.py
|
||||
skeleton.md
|
||||
tox.ini
|
||||
.github/workflows/automerge.yml
|
||||
.github/workflows/main.yml
|
||||
docs/__init__.py
|
||||
docs/conf.py
|
||||
docs/history.rst
|
||||
docs/index.rst
|
||||
docs/using.rst
|
||||
importlib_metadata/__init__.py
|
||||
importlib_metadata/_collections.py
|
||||
importlib_metadata/_compat.py
|
||||
importlib_metadata/_functools.py
|
||||
importlib_metadata/_itertools.py
|
||||
importlib_metadata/py.typed
|
||||
importlib_metadata.egg-info/PKG-INFO
|
||||
importlib_metadata.egg-info/SOURCES.txt
|
||||
importlib_metadata.egg-info/dependency_links.txt
|
||||
importlib_metadata.egg-info/requires.txt
|
||||
importlib_metadata.egg-info/top_level.txt
|
||||
importlib_metadata/docs/__init__.py
|
||||
importlib_metadata/docs/changelog.rst
|
||||
importlib_metadata/docs/conf.py
|
||||
importlib_metadata/docs/index.rst
|
||||
importlib_metadata/docs/using.rst
|
||||
importlib_metadata/tests/__init__.py
|
||||
importlib_metadata/tests/fixtures.py
|
||||
importlib_metadata/tests/test_api.py
|
||||
importlib_metadata/tests/test_integration.py
|
||||
importlib_metadata/tests/test_main.py
|
||||
importlib_metadata/tests/test_zip.py
|
||||
importlib_metadata/tests/data/__init__.py
|
||||
importlib_metadata/tests/data/example-21.12-py3-none-any.whl
|
||||
importlib_metadata/tests/data/example-21.12-py3.6.egg
|
||||
prepare/example/setup.py
|
||||
prepare/example/example/__init__.py
|
||||
tests/__init__.py
|
||||
tests/fixtures.py
|
||||
tests/py39compat.py
|
||||
tests/test_api.py
|
||||
tests/test_integration.py
|
||||
tests/test_main.py
|
||||
tests/test_zip.py
|
||||
tests/data/__init__.py
|
||||
tests/data/example-21.12-py3-none-any.whl
|
||||
tests/data/example-21.12-py3.6.egg
|
||||
prepare/example/example/__init__.py
|
|
@ -1,27 +1,17 @@
|
|||
zipp>=0.5
|
||||
|
||||
[:python_version < "3.8"]
|
||||
typing-extensions>=3.6.4
|
||||
[:python_version < "3"]
|
||||
pathlib2
|
||||
contextlib2
|
||||
configparser>=3.5
|
||||
|
||||
[docs]
|
||||
sphinx
|
||||
jaraco.packaging>=8.2
|
||||
rst.linker>=1.9
|
||||
rst.linker
|
||||
|
||||
[testing]
|
||||
pytest>=4.6
|
||||
pytest-checkdocs>=2.4
|
||||
pytest-flake8
|
||||
pytest-cov
|
||||
pytest-enabler>=1.0.1
|
||||
packaging
|
||||
pep517
|
||||
pyfakefs
|
||||
flufl.flake8
|
||||
|
||||
[testing:platform_python_implementation != "PyPy" and python_version < "3.10"]
|
||||
pytest-black>=0.3.7
|
||||
pytest-mypy
|
||||
|
||||
[testing:python_version < "3.9"]
|
||||
importlib_resources>=1.3
|
||||
|
|
|
@ -1,34 +1,41 @@
|
|||
from __future__ import unicode_literals, absolute_import
|
||||
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
import abc
|
||||
import csv
|
||||
import sys
|
||||
import zipp
|
||||
import email
|
||||
import pathlib
|
||||
import operator
|
||||
import textwrap
|
||||
import warnings
|
||||
import functools
|
||||
import itertools
|
||||
import posixpath
|
||||
import collections
|
||||
|
||||
from ._collections import FreezableDefaultDict, Pair
|
||||
from ._compat import (
|
||||
NullFinder,
|
||||
Protocol,
|
||||
PyPy_repr,
|
||||
install,
|
||||
)
|
||||
from ._functools import method_cache
|
||||
from ._itertools import unique_everseen
|
||||
|
||||
from contextlib import suppress
|
||||
NullFinder,
|
||||
ConfigParser,
|
||||
suppress,
|
||||
map,
|
||||
FileNotFoundError,
|
||||
IsADirectoryError,
|
||||
NotADirectoryError,
|
||||
PermissionError,
|
||||
pathlib,
|
||||
ModuleNotFoundError,
|
||||
MetaPathFinder,
|
||||
email_message_from_string,
|
||||
PyPy_repr,
|
||||
unique_ordered,
|
||||
str,
|
||||
)
|
||||
from importlib import import_module
|
||||
from importlib.abc import MetaPathFinder
|
||||
from itertools import starmap
|
||||
from typing import Any, List, Mapping, Optional, TypeVar, Union
|
||||
|
||||
|
||||
__metaclass__ = type
|
||||
|
||||
|
||||
__all__ = [
|
||||
|
@ -40,10 +47,9 @@ __all__ = [
|
|||
'entry_points',
|
||||
'files',
|
||||
'metadata',
|
||||
'packages_distributions',
|
||||
'requires',
|
||||
'version',
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
class PackageNotFoundError(ModuleNotFoundError):
|
||||
|
@ -55,78 +61,13 @@ class PackageNotFoundError(ModuleNotFoundError):
|
|||
|
||||
@property
|
||||
def name(self):
|
||||
(name,) = self.args
|
||||
name, = self.args
|
||||
return name
|
||||
|
||||
|
||||
class Sectioned:
|
||||
"""
|
||||
A simple entry point config parser for performance
|
||||
|
||||
>>> for item in Sectioned.read(Sectioned._sample):
|
||||
... print(item)
|
||||
Pair(name='sec1', value='# comments ignored')
|
||||
Pair(name='sec1', value='a = 1')
|
||||
Pair(name='sec1', value='b = 2')
|
||||
Pair(name='sec2', value='a = 2')
|
||||
|
||||
>>> res = Sectioned.section_pairs(Sectioned._sample)
|
||||
>>> item = next(res)
|
||||
>>> item.name
|
||||
'sec1'
|
||||
>>> item.value
|
||||
Pair(name='a', value='1')
|
||||
>>> item = next(res)
|
||||
>>> item.value
|
||||
Pair(name='b', value='2')
|
||||
>>> item = next(res)
|
||||
>>> item.name
|
||||
'sec2'
|
||||
>>> item.value
|
||||
Pair(name='a', value='2')
|
||||
>>> list(res)
|
||||
[]
|
||||
"""
|
||||
|
||||
_sample = textwrap.dedent(
|
||||
"""
|
||||
[sec1]
|
||||
# comments ignored
|
||||
a = 1
|
||||
b = 2
|
||||
|
||||
[sec2]
|
||||
a = 2
|
||||
"""
|
||||
).lstrip()
|
||||
|
||||
@classmethod
|
||||
def section_pairs(cls, text):
|
||||
return (
|
||||
section._replace(value=Pair.parse(section.value))
|
||||
for section in cls.read(text, filter_=cls.valid)
|
||||
if section.name is not None
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def read(text, filter_=None):
|
||||
lines = filter(filter_, map(str.strip, text.splitlines()))
|
||||
name = None
|
||||
for value in lines:
|
||||
section_match = value.startswith('[') and value.endswith(']')
|
||||
if section_match:
|
||||
name = value.strip('[]')
|
||||
continue
|
||||
yield Pair(name, value)
|
||||
|
||||
@staticmethod
|
||||
def valid(line):
|
||||
return line and not line.startswith('#')
|
||||
|
||||
|
||||
class EntryPoint(
|
||||
PyPy_repr, collections.namedtuple('EntryPointBase', 'name value group')
|
||||
):
|
||||
PyPy_repr,
|
||||
collections.namedtuple('EntryPointBase', 'name value group')):
|
||||
"""An entry point as defined by Python packaging conventions.
|
||||
|
||||
See `the packaging docs on entry points
|
||||
|
@ -138,7 +79,7 @@ class EntryPoint(
|
|||
r'(?P<module>[\w.]+)\s*'
|
||||
r'(:\s*(?P<attr>[\w.]+))?\s*'
|
||||
r'(?P<extras>\[.*\])?\s*$'
|
||||
)
|
||||
)
|
||||
"""
|
||||
A regular expression describing the syntax for an entry point,
|
||||
which might look like:
|
||||
|
@ -155,8 +96,6 @@ class EntryPoint(
|
|||
following the attr, and following any extras.
|
||||
"""
|
||||
|
||||
dist: Optional['Distribution'] = None
|
||||
|
||||
def load(self):
|
||||
"""Load the entry point from its definition. If only a module
|
||||
is indicated by the value, return that module. Otherwise,
|
||||
|
@ -182,190 +121,37 @@ class EntryPoint(
|
|||
match = self.pattern.match(self.value)
|
||||
return list(re.finditer(r'\w+', match.group('extras') or ''))
|
||||
|
||||
def _for(self, dist):
|
||||
self.dist = dist
|
||||
return self
|
||||
@classmethod
|
||||
def _from_config(cls, config):
|
||||
return [
|
||||
cls(name, value, group)
|
||||
for group in config.sections()
|
||||
for name, value in config.items(group)
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def _from_text(cls, text):
|
||||
config = ConfigParser(delimiters='=')
|
||||
# case sensitive: https://stackoverflow.com/q/1611799/812183
|
||||
config.optionxform = str
|
||||
try:
|
||||
config.read_string(text)
|
||||
except AttributeError: # pragma: nocover
|
||||
# Python 2 has no read_string
|
||||
config.readfp(io.StringIO(text))
|
||||
return EntryPoint._from_config(config)
|
||||
|
||||
def __iter__(self):
|
||||
"""
|
||||
Supply iter so one may construct dicts of EntryPoints by name.
|
||||
Supply iter so one may construct dicts of EntryPoints easily.
|
||||
"""
|
||||
msg = (
|
||||
"Construction of dict of EntryPoints is deprecated in "
|
||||
"favor of EntryPoints."
|
||||
)
|
||||
warnings.warn(msg, DeprecationWarning)
|
||||
return iter((self.name, self))
|
||||
|
||||
def __reduce__(self):
|
||||
return (
|
||||
self.__class__,
|
||||
(self.name, self.value, self.group),
|
||||
)
|
||||
|
||||
def matches(self, **params):
|
||||
attrs = (getattr(self, param) for param in params)
|
||||
return all(map(operator.eq, params.values(), attrs))
|
||||
|
||||
|
||||
class EntryPoints(tuple):
|
||||
"""
|
||||
An immutable collection of selectable EntryPoint objects.
|
||||
"""
|
||||
|
||||
__slots__ = ()
|
||||
|
||||
def __getitem__(self, name): # -> EntryPoint:
|
||||
"""
|
||||
Get the EntryPoint in self matching name.
|
||||
"""
|
||||
try:
|
||||
return next(iter(self.select(name=name)))
|
||||
except StopIteration:
|
||||
raise KeyError(name)
|
||||
|
||||
def select(self, **params):
|
||||
"""
|
||||
Select entry points from self that match the
|
||||
given parameters (typically group and/or name).
|
||||
"""
|
||||
return EntryPoints(ep for ep in self if ep.matches(**params))
|
||||
|
||||
@property
|
||||
def names(self):
|
||||
"""
|
||||
Return the set of all names of all entry points.
|
||||
"""
|
||||
return set(ep.name for ep in self)
|
||||
|
||||
@property
|
||||
def groups(self):
|
||||
"""
|
||||
Return the set of all groups of all entry points.
|
||||
|
||||
For coverage while SelectableGroups is present.
|
||||
>>> EntryPoints().groups
|
||||
set()
|
||||
"""
|
||||
return set(ep.group for ep in self)
|
||||
|
||||
@classmethod
|
||||
def _from_text_for(cls, text, dist):
|
||||
return cls(ep._for(dist) for ep in cls._from_text(text))
|
||||
|
||||
@classmethod
|
||||
def _from_text(cls, text):
|
||||
return itertools.starmap(EntryPoint, cls._parse_groups(text or ''))
|
||||
|
||||
@staticmethod
|
||||
def _parse_groups(text):
|
||||
return (
|
||||
(item.value.name, item.value.value, item.name)
|
||||
for item in Sectioned.section_pairs(text)
|
||||
)
|
||||
|
||||
|
||||
def flake8_bypass(func):
|
||||
# defer inspect import as performance optimization.
|
||||
import inspect
|
||||
|
||||
is_flake8 = any('flake8' in str(frame.filename) for frame in inspect.stack()[:5])
|
||||
return func if not is_flake8 else lambda: None
|
||||
|
||||
|
||||
class Deprecated:
|
||||
"""
|
||||
Compatibility add-in for mapping to indicate that
|
||||
mapping behavior is deprecated.
|
||||
|
||||
>>> recwarn = getfixture('recwarn')
|
||||
>>> class DeprecatedDict(Deprecated, dict): pass
|
||||
>>> dd = DeprecatedDict(foo='bar')
|
||||
>>> dd.get('baz', None)
|
||||
>>> dd['foo']
|
||||
'bar'
|
||||
>>> list(dd)
|
||||
['foo']
|
||||
>>> list(dd.keys())
|
||||
['foo']
|
||||
>>> 'foo' in dd
|
||||
True
|
||||
>>> list(dd.values())
|
||||
['bar']
|
||||
>>> len(recwarn)
|
||||
1
|
||||
"""
|
||||
|
||||
_warn = functools.partial(
|
||||
warnings.warn,
|
||||
"SelectableGroups dict interface is deprecated. Use select.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
def __getitem__(self, name):
|
||||
self._warn()
|
||||
return super().__getitem__(name)
|
||||
|
||||
def get(self, name, default=None):
|
||||
flake8_bypass(self._warn)()
|
||||
return super().get(name, default)
|
||||
|
||||
def __iter__(self):
|
||||
self._warn()
|
||||
return super().__iter__()
|
||||
|
||||
def __contains__(self, *args):
|
||||
self._warn()
|
||||
return super().__contains__(*args)
|
||||
|
||||
def keys(self):
|
||||
self._warn()
|
||||
return super().keys()
|
||||
|
||||
def values(self):
|
||||
self._warn()
|
||||
return super().values()
|
||||
|
||||
|
||||
class SelectableGroups(Deprecated, dict):
|
||||
"""
|
||||
A backward- and forward-compatible result from
|
||||
entry_points that fully implements the dict interface.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def load(cls, eps):
|
||||
by_group = operator.attrgetter('group')
|
||||
ordered = sorted(eps, key=by_group)
|
||||
grouped = itertools.groupby(ordered, by_group)
|
||||
return cls((group, EntryPoints(eps)) for group, eps in grouped)
|
||||
|
||||
@property
|
||||
def _all(self):
|
||||
"""
|
||||
Reconstruct a list of all entrypoints from the groups.
|
||||
"""
|
||||
groups = super(Deprecated, self).values()
|
||||
return EntryPoints(itertools.chain.from_iterable(groups))
|
||||
|
||||
@property
|
||||
def groups(self):
|
||||
return self._all.groups
|
||||
|
||||
@property
|
||||
def names(self):
|
||||
"""
|
||||
for coverage:
|
||||
>>> SelectableGroups().names
|
||||
set()
|
||||
"""
|
||||
return self._all.names
|
||||
|
||||
def select(self, **params):
|
||||
if not params:
|
||||
return self
|
||||
return self._all.select(**params)
|
||||
)
|
||||
|
||||
|
||||
class PackagePath(pathlib.PurePosixPath):
|
||||
|
@ -392,25 +178,6 @@ class FileHash:
|
|||
return '<FileHash mode: {} value: {}>'.format(self.mode, self.value)
|
||||
|
||||
|
||||
_T = TypeVar("_T")
|
||||
|
||||
|
||||
class PackageMetadata(Protocol):
|
||||
def __len__(self) -> int:
|
||||
... # pragma: no cover
|
||||
|
||||
def __contains__(self, item: str) -> bool:
|
||||
... # pragma: no cover
|
||||
|
||||
def __getitem__(self, key: str) -> str:
|
||||
... # pragma: no cover
|
||||
|
||||
def get_all(self, name: str, failobj: _T = ...) -> Union[List[Any], _T]:
|
||||
"""
|
||||
Return all values associated with a possibly multi-valued key.
|
||||
"""
|
||||
|
||||
|
||||
class Distribution:
|
||||
"""A Python distribution package."""
|
||||
|
||||
|
@ -462,8 +229,9 @@ class Distribution:
|
|||
raise ValueError("cannot accept context and kwargs")
|
||||
context = context or DistributionFinder.Context(**kwargs)
|
||||
return itertools.chain.from_iterable(
|
||||
resolver(context) for resolver in cls._discover_resolvers()
|
||||
)
|
||||
resolver(context)
|
||||
for resolver in cls._discover_resolvers()
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def at(path):
|
||||
|
@ -478,24 +246,24 @@ class Distribution:
|
|||
def _discover_resolvers():
|
||||
"""Search the meta_path for resolvers."""
|
||||
declared = (
|
||||
getattr(finder, 'find_distributions', None) for finder in sys.meta_path
|
||||
)
|
||||
getattr(finder, 'find_distributions', None)
|
||||
for finder in sys.meta_path
|
||||
)
|
||||
return filter(None, declared)
|
||||
|
||||
@classmethod
|
||||
def _local(cls, root='.'):
|
||||
from pep517 import build, meta
|
||||
|
||||
system = build.compat_system(root)
|
||||
builder = functools.partial(
|
||||
meta.build,
|
||||
source_dir=root,
|
||||
system=system,
|
||||
)
|
||||
)
|
||||
return PathDistribution(zipp.Path(meta.build_as_zip(builder)))
|
||||
|
||||
@property
|
||||
def metadata(self) -> PackageMetadata:
|
||||
def metadata(self):
|
||||
"""Return the parsed metadata for this Distribution.
|
||||
|
||||
The returned object will have keys that name the various bits of
|
||||
|
@ -508,13 +276,8 @@ class Distribution:
|
|||
# effect is to just end up using the PathDistribution's self._path
|
||||
# (which points to the egg-info file) attribute unchanged.
|
||||
or self.read_text('')
|
||||
)
|
||||
return email.message_from_string(text)
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
"""Return the 'Name' metadata for the distribution package."""
|
||||
return self.metadata['Name']
|
||||
)
|
||||
return email_message_from_string(text)
|
||||
|
||||
@property
|
||||
def version(self):
|
||||
|
@ -523,7 +286,7 @@ class Distribution:
|
|||
|
||||
@property
|
||||
def entry_points(self):
|
||||
return EntryPoints._from_text_for(self.read_text('entry_points.txt'), self)
|
||||
return EntryPoint._from_text(self.read_text('entry_points.txt'))
|
||||
|
||||
@property
|
||||
def files(self):
|
||||
|
@ -577,7 +340,23 @@ class Distribution:
|
|||
|
||||
@classmethod
|
||||
def _deps_from_requires_text(cls, source):
|
||||
return cls._convert_egg_info_reqs_to_simple_reqs(Sectioned.read(source))
|
||||
section_pairs = cls._read_sections(source.splitlines())
|
||||
sections = {
|
||||
section: list(map(operator.itemgetter('line'), results))
|
||||
for section, results in
|
||||
itertools.groupby(section_pairs, operator.itemgetter('section'))
|
||||
}
|
||||
return cls._convert_egg_info_reqs_to_simple_reqs(sections)
|
||||
|
||||
@staticmethod
|
||||
def _read_sections(lines):
|
||||
section = None
|
||||
for line in filter(None, lines):
|
||||
section_match = re.match(r'\[(.*)\]$', line)
|
||||
if section_match:
|
||||
section = section_match.group(1)
|
||||
continue
|
||||
yield locals()
|
||||
|
||||
@staticmethod
|
||||
def _convert_egg_info_reqs_to_simple_reqs(sections):
|
||||
|
@ -590,7 +369,6 @@ class Distribution:
|
|||
requirement. This method converts the former to the
|
||||
latter. See _test_deps_from_requires_text for an example.
|
||||
"""
|
||||
|
||||
def make_condition(name):
|
||||
return name and 'extra == "{name}"'.format(name=name)
|
||||
|
||||
|
@ -602,8 +380,9 @@ class Distribution:
|
|||
conditions = list(filter(None, [markers, make_condition(extra)]))
|
||||
return '; ' + ' and '.join(conditions) if conditions else ''
|
||||
|
||||
for section in sections:
|
||||
yield section.value + parse_condition(section.name)
|
||||
for section, deps in sections.items():
|
||||
for dep in deps:
|
||||
yield dep + parse_condition(section)
|
||||
|
||||
|
||||
class DistributionFinder(MetaPathFinder):
|
||||
|
@ -659,12 +438,9 @@ class FastPath:
|
|||
children.
|
||||
"""
|
||||
|
||||
@functools.lru_cache() # type: ignore
|
||||
def __new__(cls, root):
|
||||
return super().__new__(cls)
|
||||
|
||||
def __init__(self, root):
|
||||
self.root = str(root)
|
||||
self.base = os.path.basename(self.root).lower()
|
||||
|
||||
def joinpath(self, child):
|
||||
return pathlib.Path(self.root, child)
|
||||
|
@ -681,90 +457,48 @@ class FastPath:
|
|||
names = zip_path.root.namelist()
|
||||
self.joinpath = zip_path.joinpath
|
||||
|
||||
return dict.fromkeys(child.split(posixpath.sep, 1)[0] for child in names)
|
||||
return unique_ordered(
|
||||
child.split(posixpath.sep, 1)[0]
|
||||
for child in names
|
||||
)
|
||||
|
||||
def is_egg(self, search):
|
||||
base = self.base
|
||||
return (
|
||||
base == search.versionless_egg_name
|
||||
or base.startswith(search.prefix)
|
||||
and base.endswith('.egg'))
|
||||
|
||||
def search(self, name):
|
||||
return self.lookup(self.mtime).search(name)
|
||||
|
||||
@property
|
||||
def mtime(self):
|
||||
with suppress(OSError):
|
||||
return os.stat(self.root).st_mtime
|
||||
self.lookup.cache_clear()
|
||||
|
||||
@method_cache
|
||||
def lookup(self, mtime):
|
||||
return Lookup(self)
|
||||
|
||||
|
||||
class Lookup:
|
||||
def __init__(self, path: FastPath):
|
||||
base = os.path.basename(path.root).lower()
|
||||
base_is_egg = base.endswith(".egg")
|
||||
self.infos = FreezableDefaultDict(list)
|
||||
self.eggs = FreezableDefaultDict(list)
|
||||
|
||||
for child in path.children():
|
||||
low = child.lower()
|
||||
if low.endswith((".dist-info", ".egg-info")):
|
||||
# rpartition is faster than splitext and suitable for this purpose.
|
||||
name = low.rpartition(".")[0].partition("-")[0]
|
||||
normalized = Prepared.normalize(name)
|
||||
self.infos[normalized].append(path.joinpath(child))
|
||||
elif base_is_egg and low == "egg-info":
|
||||
name = base.rpartition(".")[0].partition("-")[0]
|
||||
legacy_normalized = Prepared.legacy_normalize(name)
|
||||
self.eggs[legacy_normalized].append(path.joinpath(child))
|
||||
|
||||
self.infos.freeze()
|
||||
self.eggs.freeze()
|
||||
|
||||
def search(self, prepared):
|
||||
infos = (
|
||||
self.infos[prepared.normalized]
|
||||
if prepared
|
||||
else itertools.chain.from_iterable(self.infos.values())
|
||||
)
|
||||
eggs = (
|
||||
self.eggs[prepared.legacy_normalized]
|
||||
if prepared
|
||||
else itertools.chain.from_iterable(self.eggs.values())
|
||||
)
|
||||
return itertools.chain(infos, eggs)
|
||||
for child in self.children():
|
||||
n_low = child.lower()
|
||||
if (n_low in name.exact_matches
|
||||
or n_low.startswith(name.prefix)
|
||||
and n_low.endswith(name.suffixes)
|
||||
# legacy case:
|
||||
or self.is_egg(name) and n_low == 'egg-info'):
|
||||
yield self.joinpath(child)
|
||||
|
||||
|
||||
class Prepared:
|
||||
"""
|
||||
A prepared search for metadata on a possibly-named package.
|
||||
"""
|
||||
|
||||
normalized = None
|
||||
legacy_normalized = None
|
||||
normalized = ''
|
||||
prefix = ''
|
||||
suffixes = '.dist-info', '.egg-info'
|
||||
exact_matches = [''][:0]
|
||||
versionless_egg_name = ''
|
||||
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
if name is None:
|
||||
return
|
||||
self.normalized = self.normalize(name)
|
||||
self.legacy_normalized = self.legacy_normalize(name)
|
||||
|
||||
@staticmethod
|
||||
def normalize(name):
|
||||
"""
|
||||
PEP 503 normalization plus dashes as underscores.
|
||||
"""
|
||||
return re.sub(r"[-_.]+", "-", name).lower().replace('-', '_')
|
||||
|
||||
@staticmethod
|
||||
def legacy_normalize(name):
|
||||
"""
|
||||
Normalize the package name as found in the convention in
|
||||
older packaging tools versions and specs.
|
||||
"""
|
||||
return name.lower().replace('-', '_')
|
||||
|
||||
def __bool__(self):
|
||||
return bool(self.name)
|
||||
self.normalized = name.lower().replace('-', '_')
|
||||
self.prefix = self.normalized + '-'
|
||||
self.exact_matches = [
|
||||
self.normalized + suffix for suffix in self.suffixes]
|
||||
self.versionless_egg_name = self.normalized + '.egg'
|
||||
|
||||
|
||||
@install
|
||||
|
@ -790,13 +524,10 @@ class MetadataPathFinder(NullFinder, DistributionFinder):
|
|||
@classmethod
|
||||
def _search_paths(cls, name, paths):
|
||||
"""Find metadata directories in paths heuristically."""
|
||||
prepared = Prepared(name)
|
||||
return itertools.chain.from_iterable(
|
||||
path.search(prepared) for path in map(FastPath, paths)
|
||||
)
|
||||
|
||||
def invalidate_caches(cls):
|
||||
FastPath.__new__.cache_clear()
|
||||
path.search(Prepared(name))
|
||||
for path in map(FastPath, paths)
|
||||
)
|
||||
|
||||
|
||||
class PathDistribution(Distribution):
|
||||
|
@ -809,15 +540,9 @@ class PathDistribution(Distribution):
|
|||
self._path = path
|
||||
|
||||
def read_text(self, filename):
|
||||
with suppress(
|
||||
FileNotFoundError,
|
||||
IsADirectoryError,
|
||||
KeyError,
|
||||
NotADirectoryError,
|
||||
PermissionError,
|
||||
):
|
||||
with suppress(FileNotFoundError, IsADirectoryError, KeyError,
|
||||
NotADirectoryError, PermissionError):
|
||||
return self._path.joinpath(filename).read_text(encoding='utf-8')
|
||||
|
||||
read_text.__doc__ = Distribution.read_text.__doc__
|
||||
|
||||
def locate_file(self, path):
|
||||
|
@ -841,11 +566,11 @@ def distributions(**kwargs):
|
|||
return Distribution.discover(**kwargs)
|
||||
|
||||
|
||||
def metadata(distribution_name) -> PackageMetadata:
|
||||
def metadata(distribution_name):
|
||||
"""Get the metadata for the named package.
|
||||
|
||||
:param distribution_name: The name of the distribution package to query.
|
||||
:return: A PackageMetadata containing the parsed metadata.
|
||||
:return: An email.Message containing the parsed metadata.
|
||||
"""
|
||||
return Distribution.from_name(distribution_name).metadata
|
||||
|
||||
|
@ -860,28 +585,20 @@ def version(distribution_name):
|
|||
return distribution(distribution_name).version
|
||||
|
||||
|
||||
def entry_points(**params) -> Union[EntryPoints, SelectableGroups]:
|
||||
def entry_points():
|
||||
"""Return EntryPoint objects for all installed packages.
|
||||
|
||||
Pass selection parameters (group or name) to filter the
|
||||
result to entry points matching those properties (see
|
||||
EntryPoints.select()).
|
||||
|
||||
For compatibility, returns ``SelectableGroups`` object unless
|
||||
selection parameters are supplied. In the future, this function
|
||||
will return ``EntryPoints`` instead of ``SelectableGroups``
|
||||
even when no selection parameters are supplied.
|
||||
|
||||
For maximum future compatibility, pass selection parameters
|
||||
or invoke ``.select`` with parameters on the result.
|
||||
|
||||
:return: EntryPoints or SelectableGroups for all installed packages.
|
||||
:return: EntryPoint objects for all installed packages.
|
||||
"""
|
||||
unique = functools.partial(unique_everseen, key=operator.attrgetter('name'))
|
||||
eps = itertools.chain.from_iterable(
|
||||
dist.entry_points for dist in unique(distributions())
|
||||
)
|
||||
return SelectableGroups.load(eps).select(**params)
|
||||
dist.entry_points for dist in distributions())
|
||||
by_group = operator.attrgetter('group')
|
||||
ordered = sorted(eps, key=by_group)
|
||||
grouped = itertools.groupby(ordered, by_group)
|
||||
return {
|
||||
group: tuple(eps)
|
||||
for group, eps in grouped
|
||||
}
|
||||
|
||||
|
||||
def files(distribution_name):
|
||||
|
@ -903,18 +620,4 @@ def requires(distribution_name):
|
|||
return distribution(distribution_name).requires
|
||||
|
||||
|
||||
def packages_distributions() -> Mapping[str, List[str]]:
|
||||
"""
|
||||
Return a mapping of top-level packages to their
|
||||
distributions.
|
||||
|
||||
>>> import collections.abc
|
||||
>>> pkgs = packages_distributions()
|
||||
>>> all(isinstance(dist, collections.abc.Sequence) for dist in pkgs.values())
|
||||
True
|
||||
"""
|
||||
pkg_to_dist = collections.defaultdict(list)
|
||||
for dist in distributions():
|
||||
for pkg in (dist.read_text('top_level.txt') or '').split():
|
||||
pkg_to_dist[pkg].append(dist.metadata['Name'])
|
||||
return dict(pkg_to_dist)
|
||||
__version__ = version(__name__)
|
||||
|
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче