Mirror of https://github.com/mozilla/gecko-dev.git

Bug 1467516 [wpt PR 11380] - Update html5lib and six, a=testonly

Automatic update from web-platform-tests:

Fix #7200: Update vendored html5lib to 1.0.1. This adds webencodings as another
vendored package, and moves both to third_party (part of #10922).
--
Fix #10922: Move six into third_party and update to 1.11.
--
wpt-commits: efdb898172298b29a50c2e39cd40ac191ee8b383, 7cd0b885a529734ef71afd3254df48f57f255512
wpt-pr: 11380

--HG--
rename : testing/web-platform/tests/tools/html5lib/.gitmodules => testing/web-platform/tests/tools/third_party/html5lib/.gitmodules
rename : testing/web-platform/tests/tools/html5lib/CONTRIBUTING.rst => testing/web-platform/tests/tools/third_party/html5lib/CONTRIBUTING.rst
rename : testing/web-platform/tests/tools/html5lib/LICENSE => testing/web-platform/tests/tools/third_party/html5lib/LICENSE
rename : testing/web-platform/tests/tools/html5lib/doc/Makefile => testing/web-platform/tests/tools/third_party/html5lib/doc/Makefile
rename : testing/web-platform/tests/tools/html5lib/doc/changes.rst => testing/web-platform/tests/tools/third_party/html5lib/doc/changes.rst
rename : testing/web-platform/tests/tools/html5lib/doc/license.rst => testing/web-platform/tests/tools/third_party/html5lib/doc/license.rst
rename : testing/web-platform/tests/tools/html5lib/doc/make.bat => testing/web-platform/tests/tools/third_party/html5lib/doc/make.bat
rename : testing/web-platform/tests/tools/html5lib/doc/modules.rst => testing/web-platform/tests/tools/third_party/html5lib/doc/modules.rst
rename : testing/web-platform/tests/tools/html5lib/html5lib/trie/datrie.py => testing/web-platform/tests/tools/third_party/html5lib/html5lib/_trie/datrie.py
rename : testing/web-platform/tests/tools/html5lib/html5lib/trie/py.py => testing/web-platform/tests/tools/third_party/html5lib/html5lib/_trie/py.py
rename : testing/web-platform/tests/tools/html5lib/html5lib/filters/_base.py => testing/web-platform/tests/tools/third_party/html5lib/html5lib/filters/base.py
rename : testing/web-platform/tests/tools/html5lib/html5lib/tests/__init__.py => testing/web-platform/tests/tools/third_party/html5lib/html5lib/tests/__init__.py
rename : testing/web-platform/tests/tools/html5lib/html5lib/tests/tokenizertotree.py => testing/web-platform/tests/tools/third_party/html5lib/html5lib/tests/tokenizertotree.py
rename : testing/web-platform/tests/tools/html5lib/html5lib/tests/us-ascii.html => testing/web-platform/tests/tools/third_party/html5lib/html5lib/tests/us-ascii.html
rename : testing/web-platform/tests/tools/html5lib/html5lib/tests/utf-8-bom.html => testing/web-platform/tests/tools/third_party/html5lib/html5lib/tests/utf-8-bom.html
rename : testing/web-platform/tests/tools/six/MANIFEST.in => testing/web-platform/tests/tools/third_party/six/MANIFEST.in
rename : testing/web-platform/tests/tools/six/documentation/Makefile => testing/web-platform/tests/tools/third_party/six/documentation/Makefile
rename : testing/web-platform/tests/tools/html5lib/requirements.txt => testing/web-platform/tests/tools/third_party/six/six.egg-info/top_level.txt

Parent: afbb50e29c
Commit: 02df1786d1
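For orientation (not part of the commit): the diff below mostly deletes the old
html5lib 0.9999-dev checkout from tools/html5lib; the replacement 1.0.1 tree lands
under tools/third_party/html5lib. A minimal sketch of the API the updated vendored
packages expose, assuming a stock html5lib 1.0.1 with six and webencodings available:

    # Sketch only (illustrative, not from the commit): exercises html5lib 1.0.1,
    # which depends on the six and webencodings packages vendored here.
    import html5lib
    import six

    # parse() builds an xml.etree.ElementTree document by default.
    document = html5lib.parse("<p class=demo>Hello <b>world")

    # serialize() walks the tree and re-emits HTML text.
    html = html5lib.serialize(document)
    assert isinstance(html, six.text_type)  # text on both Python 2 and 3
    print(html)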
@@ -405432,7 +405432,7 @@
    "support"
   ],
   "./.gitmodules": [
-   "6a203e28d43909d7513daf8761281b351d2b2bd7",
+   "9e008399bdce736c7c03f7db0c3e8d624083c6b9",
    "support"
   ],
   "./.pyup.yml": [
@@ -1,6 +1,3 @@
-[submodule "tools/html5lib/html5lib/tests/testdata"]
-    path = tools/html5lib/html5lib/tests/testdata
-    url = https://github.com/html5lib/html5lib-tests.git
 [submodule "resources/webidl2/test/widlproc"]
     path = resources/webidl2/test/widlproc
     url = https://github.com/dontcallmedom/widlproc.git
@@ -1,20 +0,0 @@
-# Because we never want compiled Python
-__pycache__/
-*.pyc
-
-# Ignore stuff produced by distutils
-/build/
-/dist/
-/MANIFEST
-
-# Generated by parse.py -p
-stats.prof
-
-# From cover (esp. in combination with nose)
-.coverage
-
-# Because tox's data is inherently local
-/.tox/
-
-# We have no interest in built Sphinx files
-/doc/_build
@@ -1,37 +0,0 @@
-language: python
-python:
-  - "2.6"
-  - "2.7"
-  - "3.2"
-  - "3.3"
-  - "3.4"
-  - "pypy"
-
-env:
-  - USE_OPTIONAL=true
-  - USE_OPTIONAL=false
-
-matrix:
-  exclude:
-    - python: "2.7"
-      env: USE_OPTIONAL=false
-    - python: "3.4"
-      env: USE_OPTIONAL=false
-  include:
-    - python: "2.7"
-      env: USE_OPTIONAL=false FLAKE=true
-    - python: "3.4"
-      env: USE_OPTIONAL=false FLAKE=true
-
-before_install:
-  - git submodule update --init --recursive
-
-install:
-  - bash requirements-install.sh
-
-script:
-  - nosetests
-  - bash flake8-run.sh
-
-after_script:
-  - python debug-info.py
@@ -1,171 +0,0 @@
-Change Log
-----------
-
-0.9999
-~~~~~~
-
-Released on XXX, 2014
-
-* XXX
-
-
-0.999
-~~~~~
-
-Released on December 23, 2013
-
-* Fix #127: add work-around for CPython issue #20007: .read(0) on
-  http.client.HTTPResponse drops the rest of the content.
-
-* Fix #115: lxml treewalker can now deal with fragments containing, at
-  their root level, text nodes with non-ASCII characters on Python 2.
-
-
-0.99
-~~~~
-
-Released on September 10, 2013
-
-* No library changes from 1.0b3; released as 0.99 as pip has changed
-  behaviour from 1.4 to avoid installing pre-release versions per
-  PEP 440.
-
-
-1.0b3
-~~~~~
-
-Released on July 24, 2013
-
-* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
-  implementation using it should be moved to
-  ``NonRecursiveTreeWalker``, as everything bundled with html5lib has
-  for years.
-
-* Fix #67 so that ``BufferedStream`` to correctly returns a bytes
-  object, thereby fixing any case where html5lib is passed a
-  non-seekable RawIOBase-like object.
-
-
-1.0b2
-~~~~~
-
-Released on June 27, 2013
-
-* Removed reordering of attributes within the serializer. There is now
-  an ``alphabetical_attributes`` option which preserves the previous
-  behaviour through a new filter. This allows attribute order to be
-  preserved through html5lib if the tree builder preserves order.
-
-* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
-  ``treeadapters.sax.to_sax`` which is generic and supports any
-  treewalker; it also resolves all known bugs with ``dom2sax``.
-
-* Fix treewalker assertions on hitting bytes strings on
-  Python 2. Previous to 1.0b1, treewalkers coped with mixed
-  bytes/unicode data on Python 2; this reintroduces this prior
-  behaviour on Python 2. Behaviour is unchanged on Python 3.
-
-
-1.0b1
-~~~~~
-
-Released on May 17, 2013
-
-* Implementation updated to implement the `HTML specification
-  <http://www.whatwg.org/specs/web-apps/current-work/>`_ as of 5th May
-  2013 (`SVN <http://svn.whatwg.org/webapps/>`_ revision r7867).
-
-* Python 3.2+ supported in a single codebase using the ``six`` library.
-
-* Removed support for Python 2.5 and older.
-
-* Removed the deprecated Beautiful Soup 3 treebuilder.
-  ``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that
-  since it doesn't support namespaces, foreign content like SVG and
-  MathML is parsed incorrectly.
-
-* Removed ``simpletree`` from the package. The default tree builder is
-  now ``etree`` (using the ``xml.etree.cElementTree`` implementation if
-  available, and ``xml.etree.ElementTree`` otherwise).
-
-* Removed the ``XHTMLSerializer`` as it never actually guaranteed its
-  output was well-formed XML, and hence provided little of use.
-
-* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no
-  longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
-  return the default DOM treebuilder, which uses ``xml.dom.minidom``.
-
-* Optional heuristic character encoding detection now based on
-  ``charade`` for Python 2.6 - 3.3 compatibility.
-
-* Optional ``Genshi`` treewalker support fixed.
-
-* Many bugfixes, including:
-
-  * #33: null in attribute value breaks XML AttValue;
-
-  * #4: nested, indirect descendant, <button> causes infinite loop;
-
-  * `Google Code 215
-    <http://code.google.com/p/html5lib/issues/detail?id=215>`_: Properly
-    detect seekable streams;
-
-  * `Google Code 206
-    <http://code.google.com/p/html5lib/issues/detail?id=206>`_: add
-    support for <video preload=...>, <audio preload=...>;
-
-  * `Google Code 205
-    <http://code.google.com/p/html5lib/issues/detail?id=205>`_: add
-    support for <video poster=...>;
-
-  * `Google Code 202
-    <http://code.google.com/p/html5lib/issues/detail?id=202>`_: Unicode
-    file breaks InputStream.
-
-* Source code is now mostly PEP 8 compliant.
-
-* Test harness has been improved and now depends on ``nose``.
-
-* Documentation updated and moved to http://html5lib.readthedocs.org/.
-
-
-0.95
-~~~~
-
-Released on February 11, 2012
-
-
-0.90
-~~~~
-
-Released on January 17, 2010
-
-
-0.11.1
-~~~~~~
-
-Released on June 12, 2008
-
-
-0.11
-~~~~
-
-Released on June 10, 2008
-
-
-0.10
-~~~~
-
-Released on October 7, 2007
-
-
-0.9
-~~~
-
-Released on March 11, 2007
-
-
-0.2
-~~~
-
-Released on January 8, 2007
@@ -1,77 +0,0 @@
-html5lib Package
-================
-
-:mod:`html5lib` Package
------------------------
-
-.. automodule:: html5lib.__init__
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`constants` Module
------------------------
-
-.. automodule:: html5lib.constants
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`html5parser` Module
--------------------------
-
-.. automodule:: html5lib.html5parser
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`ihatexml` Module
-----------------------
-
-.. automodule:: html5lib.ihatexml
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`inputstream` Module
--------------------------
-
-.. automodule:: html5lib.inputstream
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`sanitizer` Module
------------------------
-
-.. automodule:: html5lib.sanitizer
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`tokenizer` Module
------------------------
-
-.. automodule:: html5lib.tokenizer
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`utils` Module
--------------------
-
-.. automodule:: html5lib.utils
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-Subpackages
------------
-
-.. toctree::
-
-    html5lib.filters
-    html5lib.serializer
-    html5lib.treebuilders
-    html5lib.treewalkers
-
@@ -1,19 +0,0 @@
-serializer Package
-==================
-
-:mod:`serializer` Package
--------------------------
-
-.. automodule:: html5lib.serializer
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`htmlserializer` Module
-----------------------------
-
-.. automodule:: html5lib.serializer.htmlserializer
-    :members:
-    :undoc-members:
-    :show-inheritance:
@@ -1,59 +0,0 @@
-treewalkers Package
-===================
-
-:mod:`treewalkers` Package
---------------------------
-
-.. automodule:: html5lib.treewalkers
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`_base` Module
--------------------
-
-.. automodule:: html5lib.treewalkers._base
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`dom` Module
------------------
-
-.. automodule:: html5lib.treewalkers.dom
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`etree` Module
--------------------
-
-.. automodule:: html5lib.treewalkers.etree
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`genshistream` Module
---------------------------
-
-.. automodule:: html5lib.treewalkers.genshistream
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`lxmletree` Module
------------------------
-
-.. automodule:: html5lib.treewalkers.lxmletree
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`pulldom` Module
----------------------
-
-.. automodule:: html5lib.treewalkers.pulldom
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
@@ -1,14 +0,0 @@
-#!/bin/bash -e
-
-if [[ ! -x $(which flake8) ]]; then
-    echo "fatal: flake8 not found on $PATH. Exiting."
-    exit 1
-fi
-
-if [[ $TRAVIS != "true" || $FLAKE == "true" ]]; then
-    find html5lib/ -name '*.py' -and -not -name 'constants.py' -print0 | xargs -0 flake8 --ignore=E501
-    flake1=$?
-    flake8 --max-line-length=99 --ignore=E126 html5lib/constants.py
-    flake2=$?
-    exit $[$flake1 || $flake2]
-fi
@@ -1,23 +0,0 @@
-"""
-HTML parsing library based on the WHATWG "HTML5"
-specification. The parser is designed to be compatible with existing
-HTML found in the wild and implements well-defined error recovery that
-is largely compatible with modern desktop web browsers.
-
-Example usage:
-
-import html5lib
-f = open("my_document.html")
-tree = html5lib.parse(f)
-"""
-
-from __future__ import absolute_import, division, unicode_literals
-
-from .html5parser import HTMLParser, parse, parseFragment
-from .treebuilders import getTreeBuilder
-from .treewalkers import getTreeWalker
-from .serializer import serialize
-
-__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
-           "getTreeWalker", "serialize"]
-__version__ = "0.9999-dev"
@@ -1,20 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from . import _base
-
-try:
-    from collections import OrderedDict
-except ImportError:
-    from ordereddict import OrderedDict
-
-
-class Filter(_base.Filter):
-    def __iter__(self):
-        for token in _base.Filter.__iter__(self):
-            if token["type"] in ("StartTag", "EmptyTag"):
-                attrs = OrderedDict()
-                for name, value in sorted(token["data"].items(),
-                                          key=lambda x: x[0]):
-                    attrs[name] = value
-                token["data"] = attrs
-            yield token
@@ -1,93 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from gettext import gettext
-_ = gettext
-
-from . import _base
-from ..constants import cdataElements, rcdataElements, voidElements
-
-from ..constants import spaceCharacters
-spaceCharacters = "".join(spaceCharacters)
-
-
-class LintError(Exception):
-    pass
-
-
-class Filter(_base.Filter):
-    def __iter__(self):
-        open_elements = []
-        contentModelFlag = "PCDATA"
-        for token in _base.Filter.__iter__(self):
-            type = token["type"]
-            if type in ("StartTag", "EmptyTag"):
-                name = token["name"]
-                if contentModelFlag != "PCDATA":
-                    raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
-                if not isinstance(name, str):
-                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
-                if not name:
-                    raise LintError(_("Empty tag name"))
-                if type == "StartTag" and name in voidElements:
-                    raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
-                elif type == "EmptyTag" and name not in voidElements:
-                    raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
-                if type == "StartTag":
-                    open_elements.append(name)
-                for name, value in token["data"]:
-                    if not isinstance(name, str):
-                        raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
-                    if not name:
-                        raise LintError(_("Empty attribute name"))
-                    if not isinstance(value, str):
-                        raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
-                if name in cdataElements:
-                    contentModelFlag = "CDATA"
-                elif name in rcdataElements:
-                    contentModelFlag = "RCDATA"
-                elif name == "plaintext":
-                    contentModelFlag = "PLAINTEXT"
-
-            elif type == "EndTag":
-                name = token["name"]
-                if not isinstance(name, str):
-                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
-                if not name:
-                    raise LintError(_("Empty tag name"))
-                if name in voidElements:
-                    raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
-                start_name = open_elements.pop()
-                if start_name != name:
-                    raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
-                contentModelFlag = "PCDATA"
-
-            elif type == "Comment":
-                if contentModelFlag != "PCDATA":
-                    raise LintError(_("Comment not in PCDATA content model flag"))
-
-            elif type in ("Characters", "SpaceCharacters"):
-                data = token["data"]
-                if not isinstance(data, str):
-                    raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
-                if not data:
-                    raise LintError(_("%(type)s token with empty data") % {"type": type})
-                if type == "SpaceCharacters":
-                    data = data.strip(spaceCharacters)
-                    if data:
-                        raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
-
-            elif type == "Doctype":
-                name = token["name"]
-                if contentModelFlag != "PCDATA":
-                    raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
-                if not isinstance(name, str):
-                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
-                # XXX: what to do with token["data"] ?
-
-            elif type in ("ParseError", "SerializeError"):
-                pass
-
-            else:
-                raise LintError(_("Unknown token type: %(type)s") % {"type": type})
-
-            yield token
@@ -1,12 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from . import _base
-from ..sanitizer import HTMLSanitizerMixin
-
-
-class Filter(_base.Filter, HTMLSanitizerMixin):
-    def __iter__(self):
-        for token in _base.Filter.__iter__(self):
-            token = self.sanitize_token(token)
-            if token:
-                yield token
@@ -1,271 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-import re
-from xml.sax.saxutils import escape, unescape
-
-from .tokenizer import HTMLTokenizer
-from .constants import tokenTypes
-
-
-class HTMLSanitizerMixin(object):
-    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
-
-    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
-                           'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
-                           'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
-                           'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
-                           'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
-                           'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
-                           'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
-                           'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
-                           'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
-                           'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
-                           'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
-                           'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
-                           'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
-
-    mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
-                       'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
-                       'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
-                       'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
-                       'munderover', 'none']
-
-    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
-                    'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
-                    'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
-                    'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
-                    'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
-                    'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
-
-    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
-                             'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
-                             'background', 'balance', 'bgcolor', 'bgproperties', 'border',
-                             'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
-                             'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
-                             'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
-                             'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
-                             'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
-                             'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
-                             'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
-                             'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
-                             'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
-                             'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
-                             'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
-                             'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
-                             'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
-                             'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
-                             'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
-                             'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
-                             'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
-                             'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
-                             'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
-                             'width', 'wrap', 'xml:lang']
-
-    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
-                         'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
-                         'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
-                         'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
-                         'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
-                         'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
-                         'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
-                         'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
-                         'xlink:type', 'xmlns', 'xmlns:xlink']
-
-    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
-                      'arabic-form', 'ascent', 'attributeName', 'attributeType',
-                      'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
-                      'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
-                      'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
-                      'fill-opacity', 'fill-rule', 'font-family', 'font-size',
-                      'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
-                      'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
-                      'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
-                      'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
-                      'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
-                      'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
-                      'opacity', 'orient', 'origin', 'overline-position',
-                      'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
-                      'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
-                      'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
-                      'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
-                      'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
-                      'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
-                      'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
-                      'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
-                      'transform', 'type', 'u1', 'u2', 'underline-position',
-                      'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
-                      'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
-                      'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
-                      'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
-                      'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
-                      'y1', 'y2', 'zoomAndPan']
-
-    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
-                       'xlink:href', 'xml:base']
-
-    svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
-                               'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
-                               'mask', 'stroke']
-
-    svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
-                            'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
-                            'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
-                            'set', 'use']
-
-    acceptable_css_properties = ['azimuth', 'background-color',
-                                 'border-bottom-color', 'border-collapse', 'border-color',
-                                 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
-                                 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
-                                 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
-                                 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
-                                 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
-                                 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
-                                 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
-                                 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
-                                 'white-space', 'width']
-
-    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
-                               'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
-                               'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
-                               'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
-                               'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
-                               'transparent', 'underline', 'white', 'yellow']
-
-    acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
-                                 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
-                                 'stroke-opacity']
-
-    acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
-                            'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
-                            'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
-                            'ssh', 'sftp', 'rtsp', 'afs']
-
-    # subclasses may define their own versions of these constants
-    allowed_elements = acceptable_elements + mathml_elements + svg_elements
-    allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
-    allowed_css_properties = acceptable_css_properties
-    allowed_css_keywords = acceptable_css_keywords
-    allowed_svg_properties = acceptable_svg_properties
-    allowed_protocols = acceptable_protocols
-
-    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
-    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
-    # attributes are parsed, and a restricted set, # specified by
-    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
-    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
-    # in ALLOWED_PROTOCOLS are allowed.
-    #
-    #   sanitize_html('<script> do_nasty_stuff() </script>')
-    #    => &lt;script> do_nasty_stuff() &lt;/script>
-    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
-    #    => <a>Click here for $100</a>
-    def sanitize_token(self, token):
-
-        # accommodate filters which use token_type differently
-        token_type = token["type"]
-        if token_type in list(tokenTypes.keys()):
-            token_type = tokenTypes[token_type]
-
-        if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
-                          tokenTypes["EmptyTag"]):
-            if token["name"] in self.allowed_elements:
-                return self.allowed_token(token, token_type)
-            else:
-                return self.disallowed_token(token, token_type)
-        elif token_type == tokenTypes["Comment"]:
-            pass
-        else:
-            return token
-
-    def allowed_token(self, token, token_type):
-        if "data" in token:
-            attrs = dict([(name, val) for name, val in
-                          token["data"][::-1]
-                          if name in self.allowed_attributes])
-            for attr in self.attr_val_is_uri:
-                if attr not in attrs:
-                    continue
-                val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
-                                       unescape(attrs[attr])).lower()
-                # remove replacement characters from unescaped characters
-                val_unescaped = val_unescaped.replace("\ufffd", "")
-                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
-                    (val_unescaped.split(':')[0] not in
-                     self.allowed_protocols)):
-                    del attrs[attr]
-            for attr in self.svg_attr_val_allows_ref:
-                if attr in attrs:
-                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
-                                         ' ',
-                                         unescape(attrs[attr]))
-            if (token["name"] in self.svg_allow_local_href and
-                'xlink:href' in attrs and re.search('^\s*[^#\s].*',
-                                                    attrs['xlink:href'])):
-                del attrs['xlink:href']
-            if 'style' in attrs:
-                attrs['style'] = self.sanitize_css(attrs['style'])
-            token["data"] = [[name, val] for name, val in list(attrs.items())]
-        return token
-
-    def disallowed_token(self, token, token_type):
-        if token_type == tokenTypes["EndTag"]:
-            token["data"] = "</%s>" % token["name"]
-        elif token["data"]:
-            attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
-            token["data"] = "<%s%s>" % (token["name"], attrs)
-        else:
-            token["data"] = "<%s>" % token["name"]
-        if token.get("selfClosing"):
-            token["data"] = token["data"][:-1] + "/>"
-
-        if token["type"] in list(tokenTypes.keys()):
-            token["type"] = "Characters"
-        else:
-            token["type"] = tokenTypes["Characters"]
-
-        del token["name"]
-        return token
-
-    def sanitize_css(self, style):
-        # disallow urls
-        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
-
-        # gauntlet
-        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
-            return ''
-        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
-            return ''
-
-        clean = []
-        for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
-            if not value:
-                continue
-            if prop.lower() in self.allowed_css_properties:
-                clean.append(prop + ': ' + value + ';')
-            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
-                                                'padding']:
-                for keyword in value.split():
-                    if keyword not in self.acceptable_css_keywords and \
-                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
-                        break
-                else:
-                    clean.append(prop + ': ' + value + ';')
-            elif prop.lower() in self.allowed_svg_properties:
-                clean.append(prop + ': ' + value + ';')
-
-        return ' '.join(clean)
-
-
-class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
-    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
-                 lowercaseElementName=False, lowercaseAttrName=False, parser=None):
-        # Change case matching defaults as we only output lowercase html anyway
-        # This solution doesn't seem ideal...
-        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
-                               lowercaseElementName, lowercaseAttrName, parser=parser)
-
-    def __iter__(self):
-        for token in HTMLTokenizer.__iter__(self):
-            token = self.sanitize_token(token)
-            if token:
-                yield token
@@ -1,16 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from .. import treewalkers
-
-from .htmlserializer import HTMLSerializer
-
-
-def serialize(input, tree="etree", format="html", encoding=None,
-              **serializer_opts):
-    # XXX: Should we cache this?
-    walker = treewalkers.getTreeWalker(tree)
-    if format == "html":
-        s = HTMLSerializer(**serializer_opts)
-    else:
-        raise ValueError("type must be html")
-    return s.render(walker(input), encoding)
@@ -1,320 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-from six import text_type
-
-import gettext
-_ = gettext.gettext
-
-try:
-    from functools import reduce
-except ImportError:
-    pass
-
-from ..constants import voidElements, booleanAttributes, spaceCharacters
-from ..constants import rcdataElements, entities, xmlEntities
-from .. import utils
-from xml.sax.saxutils import escape
-
-spaceCharacters = "".join(spaceCharacters)
-
-try:
-    from codecs import register_error, xmlcharrefreplace_errors
-except ImportError:
-    unicode_encode_errors = "strict"
-else:
-    unicode_encode_errors = "htmlentityreplace"
-
-    encode_entity_map = {}
-    is_ucs4 = len("\U0010FFFF") == 1
-    for k, v in list(entities.items()):
-        # skip multi-character entities
-        if ((is_ucs4 and len(v) > 1) or
-                (not is_ucs4 and len(v) > 2)):
-            continue
-        if v != "&":
-            if len(v) == 2:
-                v = utils.surrogatePairToCodepoint(v)
-            else:
-                v = ord(v)
-            if v not in encode_entity_map or k.islower():
-                # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
-                encode_entity_map[v] = k
-
-    def htmlentityreplace_errors(exc):
-        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
-            res = []
-            codepoints = []
-            skip = False
-            for i, c in enumerate(exc.object[exc.start:exc.end]):
-                if skip:
-                    skip = False
-                    continue
-                index = i + exc.start
-                if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
-                    codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
-                    skip = True
-                else:
-                    codepoint = ord(c)
-                codepoints.append(codepoint)
-            for cp in codepoints:
-                e = encode_entity_map.get(cp)
-                if e:
-                    res.append("&")
-                    res.append(e)
-                    if not e.endswith(";"):
-                        res.append(";")
-                else:
-                    res.append("&#x%s;" % (hex(cp)[2:]))
-            return ("".join(res), exc.end)
-        else:
-            return xmlcharrefreplace_errors(exc)
-
-    register_error(unicode_encode_errors, htmlentityreplace_errors)
-
-    del register_error
-
-
-class HTMLSerializer(object):
-
-    # attribute quoting options
-    quote_attr_values = False
-    quote_char = '"'
-    use_best_quote_char = True
-
-    # tag syntax options
-    omit_optional_tags = True
-    minimize_boolean_attributes = True
-    use_trailing_solidus = False
-    space_before_trailing_solidus = True
-
-    # escaping options
-    escape_lt_in_attrs = False
-    escape_rcdata = False
-    resolve_entities = True
-
-    # miscellaneous options
-    alphabetical_attributes = False
-    inject_meta_charset = True
-    strip_whitespace = False
-    sanitize = False
-
-    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
-               "omit_optional_tags", "minimize_boolean_attributes",
-               "use_trailing_solidus", "space_before_trailing_solidus",
-               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
-               "alphabetical_attributes", "inject_meta_charset",
-               "strip_whitespace", "sanitize")
-
-    def __init__(self, **kwargs):
-        """Initialize HTMLSerializer.
-
-        Keyword options (default given first unless specified) include:
-
-        inject_meta_charset=True|False
-          Whether it insert a meta element to define the character set of the
-          document.
-        quote_attr_values=True|False
-          Whether to quote attribute values that don't require quoting
-          per HTML5 parsing rules.
-        quote_char=u'"'|u"'"
-          Use given quote character for attribute quoting. Default is to
-          use double quote unless attribute value contains a double quote,
-          in which case single quotes are used instead.
-        escape_lt_in_attrs=False|True
-          Whether to escape < in attribute values.
-        escape_rcdata=False|True
-          Whether to escape characters that need to be escaped within normal
-          elements within rcdata elements such as style.
-        resolve_entities=True|False
-          Whether to resolve named character entities that appear in the
-          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
-          are unaffected by this setting.
-        strip_whitespace=False|True
-          Whether to remove semantically meaningless whitespace. (This
-          compresses all whitespace to a single space except within pre.)
-        minimize_boolean_attributes=True|False
-          Shortens boolean attributes to give just the attribute value,
-          for example <input disabled="disabled"> becomes <input disabled>.
-        use_trailing_solidus=False|True
-          Includes a close-tag slash at the end of the start tag of void
-          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
-        space_before_trailing_solidus=True|False
-          Places a space immediately before the closing slash in a tag
-          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
-        sanitize=False|True
-          Strip all unsafe or unknown constructs from output.
-          See `html5lib user documentation`_
-        omit_optional_tags=True|False
-          Omit start/end tags that are optional.
-        alphabetical_attributes=False|True
-          Reorder attributes to be in alphabetical order.
-
-        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
-        """
-        if 'quote_char' in kwargs:
-            self.use_best_quote_char = False
-        for attr in self.options:
-            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
-        self.errors = []
-        self.strict = False
-
-    def encode(self, string):
-        assert(isinstance(string, text_type))
-        if self.encoding:
-            return string.encode(self.encoding, unicode_encode_errors)
-        else:
-            return string
-
-    def encodeStrict(self, string):
-        assert(isinstance(string, text_type))
-        if self.encoding:
-            return string.encode(self.encoding, "strict")
-        else:
-            return string
-
-    def serialize(self, treewalker, encoding=None):
-        self.encoding = encoding
-        in_cdata = False
-        self.errors = []
-
-        if encoding and self.inject_meta_charset:
-            from ..filters.inject_meta_charset import Filter
-            treewalker = Filter(treewalker, encoding)
-        # WhitespaceFilter should be used before OptionalTagFilter
-        # for maximum efficiently of this latter filter
-        if self.strip_whitespace:
-            from ..filters.whitespace import Filter
-            treewalker = Filter(treewalker)
-        if self.sanitize:
-            from ..filters.sanitizer import Filter
-            treewalker = Filter(treewalker)
-        if self.omit_optional_tags:
-            from ..filters.optionaltags import Filter
-            treewalker = Filter(treewalker)
-        # Alphabetical attributes must be last, as other filters
-        # could add attributes and alter the order
-        if self.alphabetical_attributes:
-            from ..filters.alphabeticalattributes import Filter
-            treewalker = Filter(treewalker)
-
-        for token in treewalker:
-            type = token["type"]
-            if type == "Doctype":
-                doctype = "<!DOCTYPE %s" % token["name"]
-
-                if token["publicId"]:
-                    doctype += ' PUBLIC "%s"' % token["publicId"]
-                elif token["systemId"]:
-                    doctype += " SYSTEM"
-                if token["systemId"]:
-                    if token["systemId"].find('"') >= 0:
-                        if token["systemId"].find("'") >= 0:
-                            self.serializeError(_("System identifer contains both single and double quote characters"))
-                        quote_char = "'"
-                    else:
-                        quote_char = '"'
-                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
-
-                doctype += ">"
-                yield self.encodeStrict(doctype)
-
-            elif type in ("Characters", "SpaceCharacters"):
-                if type == "SpaceCharacters" or in_cdata:
-                    if in_cdata and token["data"].find("</") >= 0:
-                        self.serializeError(_("Unexpected </ in CDATA"))
-                    yield self.encode(token["data"])
-                else:
-                    yield self.encode(escape(token["data"]))
-
-            elif type in ("StartTag", "EmptyTag"):
-                name = token["name"]
-                yield self.encodeStrict("<%s" % name)
-                if name in rcdataElements and not self.escape_rcdata:
-                    in_cdata = True
-                elif in_cdata:
-                    self.serializeError(_("Unexpected child element of a CDATA element"))
-                for (attr_namespace, attr_name), attr_value in token["data"].items():
-                    # TODO: Add namespace support here
-                    k = attr_name
-                    v = attr_value
-                    yield self.encodeStrict(' ')
-
-                    yield self.encodeStrict(k)
-                    if not self.minimize_boolean_attributes or \
-                        (k not in booleanAttributes.get(name, tuple())
-                         and k not in booleanAttributes.get("", tuple())):
-                        yield self.encodeStrict("=")
-                        if self.quote_attr_values or not v:
-                            quote_attr = True
-                        else:
-                            quote_attr = reduce(lambda x, y: x or (y in v),
-                                                spaceCharacters + ">\"'=", False)
-                        v = v.replace("&", "&amp;")
-                        if self.escape_lt_in_attrs:
-                            v = v.replace("<", "&lt;")
-                        if quote_attr:
-                            quote_char = self.quote_char
-                            if self.use_best_quote_char:
-                                if "'" in v and '"' not in v:
-                                    quote_char = '"'
-                                elif '"' in v and "'" not in v:
-                                    quote_char = "'"
-                            if quote_char == "'":
-                                v = v.replace("'", "&#39;")
-                            else:
-                                v = v.replace('"', "&quot;")
-                            yield self.encodeStrict(quote_char)
-                            yield self.encode(v)
-                            yield self.encodeStrict(quote_char)
-                        else:
-                            yield self.encode(v)
-                if name in voidElements and self.use_trailing_solidus:
-                    if self.space_before_trailing_solidus:
-                        yield self.encodeStrict(" /")
-                    else:
-                        yield self.encodeStrict("/")
-                yield self.encode(">")
-
-            elif type == "EndTag":
-                name = token["name"]
-                if name in rcdataElements:
-                    in_cdata = False
-                elif in_cdata:
-                    self.serializeError(_("Unexpected child element of a CDATA element"))
-                yield self.encodeStrict("</%s>" % name)
-
-            elif type == "Comment":
-                data = token["data"]
-                if data.find("--") >= 0:
-                    self.serializeError(_("Comment contains --"))
-                yield self.encodeStrict("<!--%s-->" % token["data"])
-
-            elif type == "Entity":
-                name = token["name"]
-                key = name + ";"
-                if key not in entities:
-                    self.serializeError(_("Entity %s not recognized" % name))
-                if self.resolve_entities and key not in xmlEntities:
-                    data = entities[key]
-                else:
-                    data = "&%s;" % name
-                yield self.encodeStrict(data)
-
-            else:
-                self.serializeError(token["data"])
-
-    def render(self, treewalker, encoding=None):
-        if encoding:
-            return b"".join(list(self.serialize(treewalker, encoding)))
-        else:
-            return "".join(list(self.serialize(treewalker)))
-
-    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
-        # XXX The idea is to make data mandatory.
-        self.errors.append(data)
-        if self.strict:
-            raise SerializeError
-
-
-def SerializeError(Exception):
-    """Error in serialized tree"""
-    pass
@@ -1 +0,0 @@
-Each testcase file can be run through nose (using ``nosetests``).
@@ -1,41 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-import sys
-import os
-
-if __name__ == '__main__':
-    # Allow us to import from the src directory
-    os.chdir(os.path.split(os.path.abspath(__file__))[0])
-    sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src")))
-
-    from html5lib.tokenizer import HTMLTokenizer
-
-
-class HTMLParser(object):
-    """ Fake parser to test tokenizer output """
-    def parse(self, stream, output=True):
-        tokenizer = HTMLTokenizer(stream)
-        for token in tokenizer:
-            if output:
-                print(token)
-
-if __name__ == "__main__":
-    x = HTMLParser()
-    if len(sys.argv) > 1:
-        if len(sys.argv) > 2:
-            import hotshot
-            import hotshot.stats
-            prof = hotshot.Profile('stats.prof')
-            prof.runcall(x.parse, sys.argv[1], False)
-            prof.close()
-            stats = hotshot.stats.load('stats.prof')
-            stats.strip_dirs()
-            stats.sort_stats('time')
-            stats.print_stats()
-        else:
-            x.parse(sys.argv[1])
-    else:
-        print("""Usage: python mockParser.py filename [stats]
-If stats is specified the hotshots profiler will run and output the
-stats instead.
-""")
@@ -1,36 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-
-def f1():
-    x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    z = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    x += y + z
-
-
-def f2():
-    x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    z = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    x = x + y + z
-
-
-def f3():
-    x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    z = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    x = "".join((x, y, z))
-
-
-def f4():
-    x = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    y = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    z = "ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    x = "%s%s%s" % (x, y, z)
-
-import timeit
-for x in range(4):
-    statement = "f%s" % (x + 1)
-    t = timeit.Timer(statement, "from __main__ import " + statement)
-    r = t.repeat(3, 1000000)
-    print(r, min(r))
@@ -1,67 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-
-import os
-import unittest
-
-try:
-    unittest.TestCase.assertEqual
-except AttributeError:
-    unittest.TestCase.assertEqual = unittest.TestCase.assertEquals
-
-from .support import get_data_files, TestData, test_dir, errorMessage
-from html5lib import HTMLParser, inputstream
-
-
-class Html5EncodingTestCase(unittest.TestCase):
-    def test_codec_name_a(self):
-        self.assertEqual(inputstream.codecName("utf-8"), "utf-8")
-
-    def test_codec_name_b(self):
-        self.assertEqual(inputstream.codecName("utf8"), "utf-8")
-
-    def test_codec_name_c(self):
-        self.assertEqual(inputstream.codecName(" utf8 "), "utf-8")
-
-    def test_codec_name_d(self):
-        self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
-
-
-def runParserEncodingTest(data, encoding):
-    p = HTMLParser()
-    assert p.documentEncoding is None
-    p.parse(data, useChardet=False)
-    encoding = encoding.lower().decode("ascii")
-
-    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
-
-
-def runPreScanEncodingTest(data, encoding):
-    stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
-    encoding = encoding.lower().decode("ascii")
-
-    # Very crude way to ignore irrelevant tests
-    if len(data) > stream.numBytesMeta:
-        return
-
-    assert encoding == stream.charEncoding[0], errorMessage(data, encoding, stream.charEncoding[0])
-
-
-def test_encoding():
-    for filename in get_data_files("encoding"):
-        tests = TestData(filename, b"data", encoding=None)
-        for idx, test in enumerate(tests):
-            yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
-            yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
-
-try:
-    try:
-        import charade  # flake8: noqa
-    except ImportError:
-        import chardet  # flake8: noqa
-except ImportError:
-    print("charade/chardet not found, skipping chardet tests")
-else:
-    def test_chardet():
-        with open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb") as fp:
-            encoding = inputstream.HTMLInputStream(fp.read()).charEncoding
-            assert encoding[0].lower() == "big5"
@ -1,96 +0,0 @@
from __future__ import absolute_import, division, unicode_literals

import os
import sys
import traceback
import warnings
import re

warnings.simplefilter("error")

from .support import get_data_files
from .support import TestData, convert, convertExpected, treeTypes
from html5lib import html5parser, constants

# Run the parse error checks
checkParseErrors = False

# XXX - There should just be one function here but for some reason the testcase
# format differs from the treedump format by a single space character


def convertTreeDump(data):
    return "\n".join(convert(3)(data).split("\n")[1:])

namespaceExpected = re.compile(r"^(\s*)<(\S+)>", re.M).sub


def runParserTest(innerHTML, input, expected, errors, treeClass,
                  namespaceHTMLElements):
    with warnings.catch_warnings(record=True) as caughtWarnings:
        warnings.simplefilter("always")
        p = html5parser.HTMLParser(tree=treeClass,
                                   namespaceHTMLElements=namespaceHTMLElements)

        try:
            if innerHTML:
                document = p.parseFragment(input, innerHTML)
            else:
                document = p.parse(input)
        except:
            errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
                                  "\nTraceback:", traceback.format_exc()])
            assert False, errorMsg

    otherWarnings = [x for x in caughtWarnings
                     if not issubclass(x.category, constants.DataLossWarning)]
    assert len(otherWarnings) == 0, [(x.category, x.message) for x in otherWarnings]
    if len(caughtWarnings):
        return

    output = convertTreeDump(p.tree.testSerializer(document))

    expected = convertExpected(expected)
    if namespaceHTMLElements:
        expected = namespaceExpected(r"\1<html \2>", expected)

    errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
                          "\nReceived:", output])
    assert expected == output, errorMsg

    errStr = []
    for (line, col), errorcode, datavars in p.errors:
        assert isinstance(datavars, dict), "%s, %s" % (errorcode, repr(datavars))
        errStr.append("Line: %i Col: %i %s" % (line, col,
                                               constants.E[errorcode] % datavars))

    errorMsg2 = "\n".join(["\n\nInput:", input,
                           "\nExpected errors (" + str(len(errors)) + "):\n" + "\n".join(errors),
                           "\nActual errors (" + str(len(p.errors)) + "):\n" + "\n".join(errStr)])
    if checkParseErrors:
        assert len(p.errors) == len(errors), errorMsg2


def test_parser():
    sys.stderr.write('Testing tree builders ' + " ".join(list(treeTypes.keys())) + "\n")
    files = get_data_files('tree-construction')

    for filename in files:
        testName = os.path.basename(filename).replace(".dat", "")
        if testName in ("template",):
            continue

        tests = TestData(filename, "data")

        for index, test in enumerate(tests):
            input, errors, innerHTML, expected = [test[key] for key in
                                                  ('data', 'errors',
                                                   'document-fragment',
                                                   'document')]
            if errors:
                errors = errors.split("\n")

            for treeName, treeCls in treeTypes.items():
                for namespaceHTMLElements in (True, False):
                    yield (runParserTest, innerHTML, input, expected, errors, treeCls,
                           namespaceHTMLElements)
@@ -1,64 +0,0 @@
from __future__ import absolute_import, division, unicode_literals

import io

from . import support  # flake8: noqa
from html5lib import html5parser
from html5lib.constants import namespaces
from html5lib import treebuilders

import unittest

# tests that aren't autogenerated from text files


class MoreParserTests(unittest.TestCase):

    def setUp(self):
        self.dom_tree = treebuilders.getTreeBuilder("dom")

    def test_assertDoctypeCloneable(self):
        parser = html5parser.HTMLParser(tree=self.dom_tree)
        doc = parser.parse('<!DOCTYPE HTML>')
        self.assertTrue(doc.cloneNode(True))

    def test_line_counter(self):
        # http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
        parser = html5parser.HTMLParser(tree=self.dom_tree)
        parser.parse("<pre>\nx\n&gt;\n</pre>")

    def test_namespace_html_elements_0_dom(self):
        parser = html5parser.HTMLParser(tree=self.dom_tree, namespaceHTMLElements=True)
        doc = parser.parse("<html></html>")
        self.assertTrue(doc.childNodes[0].namespaceURI == namespaces["html"])

    def test_namespace_html_elements_1_dom(self):
        parser = html5parser.HTMLParser(tree=self.dom_tree, namespaceHTMLElements=False)
        doc = parser.parse("<html></html>")
        self.assertTrue(doc.childNodes[0].namespaceURI is None)

    def test_namespace_html_elements_0_etree(self):
        parser = html5parser.HTMLParser(namespaceHTMLElements=True)
        doc = parser.parse("<html></html>")
        self.assertTrue(list(doc)[0].tag == "{%s}html" % (namespaces["html"],))

    def test_namespace_html_elements_1_etree(self):
        parser = html5parser.HTMLParser(namespaceHTMLElements=False)
        doc = parser.parse("<html></html>")
        self.assertTrue(list(doc)[0].tag == "html")

    def test_unicode_file(self):
        parser = html5parser.HTMLParser()
        parser.parse(io.StringIO("a"))


def buildTestSuite():
    return unittest.defaultTestLoader.loadTestsFromName(__name__)


def main():
    buildTestSuite()
    unittest.main()

if __name__ == '__main__':
    main()
@@ -1,105 +0,0 @@
from __future__ import absolute_import, division, unicode_literals

try:
    import json
except ImportError:
    import simplejson as json

from html5lib import html5parser, sanitizer, constants, treebuilders


def toxmlFactory():
    tree = treebuilders.getTreeBuilder("etree")

    def toxml(element):
        # encode/decode roundtrip required for Python 2.6 compatibility
        result_bytes = tree.implementation.tostring(element, encoding="utf-8")
        return result_bytes.decode("utf-8")

    return toxml


def runSanitizerTest(name, expected, input, toxml=None):
    if toxml is None:
        toxml = toxmlFactory()
    expected = ''.join([toxml(token) for token in html5parser.HTMLParser().
                        parseFragment(expected)])
    expected = json.loads(json.dumps(expected))
    assert expected == sanitize_html(input)


def sanitize_html(stream, toxml=None):
    if toxml is None:
        toxml = toxmlFactory()
    return ''.join([toxml(token) for token in
                    html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
                    parseFragment(stream)])


def test_should_handle_astral_plane_characters():
    assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>' == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")


def test_sanitizer():
    toxml = toxmlFactory()
    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
        if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
            continue  # TODO
        if tag_name != tag_name.lower():
            continue  # TODO
        if tag_name == 'image':
            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
                   "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
                   toxml)
        elif tag_name == 'br':
            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
                   "<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
                   toxml)
        elif tag_name in constants.voidElements:
            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
                   "<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
                   toxml)
        else:
            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
                   "<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name),
                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
                   toxml)

    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
        tag_name = tag_name.upper()
        yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
               "&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name, tag_name),
               "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
               toxml)

    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
        if attribute_name != attribute_name.lower():
            continue  # TODO
        if attribute_name == 'style':
            continue
        yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
               "<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
               "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
               toxml)

    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
        attribute_name = attribute_name.upper()
        yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
               "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
               "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name,
               toxml)

    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
               "<a href=\"%s\">foo</a>" % protocol,
               """<a href="%s">foo</a>""" % protocol,
               toxml)

    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
               "<a href=\"%s\">foo</a>" % protocol,
               """<a href="%s">foo</a>""" % protocol,
               toxml)
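
The sanitize_html helper above runs a fragment through HTMLParser with the sanitizing tokenizer and re-serializes each top-level node with toxml. A sketch of the same call outside the test harness, assuming the pre-1.0 API in which sanitization was a tokenizer (the input markup is illustrative):

    from html5lib import html5parser, sanitizer

    parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    fragment = parser.parseFragment("<p onclick='x()'>hi</p>")
    # disallowed tags and attributes are escaped or dropped rather than parsed as-is
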
@@ -1,178 +0,0 @@
from __future__ import absolute_import, division, unicode_literals

import json
import unittest

from .support import get_data_files

try:
    unittest.TestCase.assertEqual
except AttributeError:
    unittest.TestCase.assertEqual = unittest.TestCase.assertEquals

import html5lib
from html5lib import constants
from html5lib.serializer import HTMLSerializer, serialize
from html5lib.treewalkers._base import TreeWalker

optionals_loaded = []

try:
    from lxml import etree
    optionals_loaded.append("lxml")
except ImportError:
    pass

default_namespace = constants.namespaces["html"]


class JsonWalker(TreeWalker):
    def __iter__(self):
        for token in self.tree:
            type = token[0]
            if type == "StartTag":
                if len(token) == 4:
                    namespace, name, attrib = token[1:4]
                else:
                    namespace = default_namespace
                    name, attrib = token[1:3]
                yield self.startTag(namespace, name, self._convertAttrib(attrib))
            elif type == "EndTag":
                if len(token) == 3:
                    namespace, name = token[1:3]
                else:
                    namespace = default_namespace
                    name = token[1]
                yield self.endTag(namespace, name)
            elif type == "EmptyTag":
                if len(token) == 4:
                    namespace, name, attrib = token[1:]
                else:
                    namespace = default_namespace
                    name, attrib = token[1:]
                for token in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
                    yield token
            elif type == "Comment":
                yield self.comment(token[1])
            elif type in ("Characters", "SpaceCharacters"):
                for token in self.text(token[1]):
                    yield token
            elif type == "Doctype":
                if len(token) == 4:
                    yield self.doctype(token[1], token[2], token[3])
                elif len(token) == 3:
                    yield self.doctype(token[1], token[2])
                else:
                    yield self.doctype(token[1])
            else:
                raise ValueError("Unknown token type: " + type)

    def _convertAttrib(self, attribs):
        """html5lib tree-walkers use a dict of (namespace, name): value for
        attributes, but JSON cannot represent this. Convert from the format
        in the serializer tests (a list of dicts with "namespace", "name",
        and "value" as keys) to html5lib's tree-walker format."""
        attrs = {}
        for attrib in attribs:
            name = (attrib["namespace"], attrib["name"])
            assert(name not in attrs)
            attrs[name] = attrib["value"]
        return attrs


def serialize_html(input, options):
    options = dict([(str(k), v) for k, v in options.items()])
    stream = JsonWalker(input)
    serializer = HTMLSerializer(alphabetical_attributes=True, **options)
    return serializer.render(stream, options.get("encoding", None))


def runSerializerTest(input, expected, options):
    encoding = options.get("encoding", None)

    if encoding:
        encode = lambda x: x.encode(encoding)
        expected = list(map(encode, expected))

    result = serialize_html(input, options)
    if len(expected) == 1:
        assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (expected[0], result, str(options))
    elif result not in expected:
        assert False, "Expected: %s, Received: %s" % (expected, result)


class EncodingTestCase(unittest.TestCase):
    def throwsWithLatin1(self, input):
        self.assertRaises(UnicodeEncodeError, serialize_html, input, {"encoding": "iso-8859-1"})

    def testDoctypeName(self):
        self.throwsWithLatin1([["Doctype", "\u0101"]])

    def testDoctypePublicId(self):
        self.throwsWithLatin1([["Doctype", "potato", "\u0101"]])

    def testDoctypeSystemId(self):
        self.throwsWithLatin1([["Doctype", "potato", "potato", "\u0101"]])

    def testCdataCharacters(self):
        runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\u0101"]],
                          ["<style>&amacr;"], {"encoding": "iso-8859-1"})

    def testCharacters(self):
        runSerializerTest([["Characters", "\u0101"]],
                          ["&amacr;"], {"encoding": "iso-8859-1"})

    def testStartTagName(self):
        self.throwsWithLatin1([["StartTag", "http://www.w3.org/1999/xhtml", "\u0101", []]])

    def testEmptyTagName(self):
        self.throwsWithLatin1([["EmptyTag", "http://www.w3.org/1999/xhtml", "\u0101", []]])

    def testAttributeName(self):
        self.throwsWithLatin1([["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": None, "name": "\u0101", "value": "potato"}]]])

    def testAttributeValue(self):
        runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "span",
                            [{"namespace": None, "name": "potato", "value": "\u0101"}]]],
                          ["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})

    def testEndTagName(self):
        self.throwsWithLatin1([["EndTag", "http://www.w3.org/1999/xhtml", "\u0101"]])

    def testComment(self):
        self.throwsWithLatin1([["Comment", "\u0101"]])


if "lxml" in optionals_loaded:
    class LxmlTestCase(unittest.TestCase):
        def setUp(self):
            self.parser = etree.XMLParser(resolve_entities=False)
            self.treewalker = html5lib.getTreeWalker("lxml")
            self.serializer = HTMLSerializer()

        def testEntityReplacement(self):
            doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
            tree = etree.fromstring(doc, parser=self.parser).getroottree()
            result = serialize(tree, tree="lxml", omit_optional_tags=False)
            self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)

        def testEntityXML(self):
            doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>"""
            tree = etree.fromstring(doc, parser=self.parser).getroottree()
            result = serialize(tree, tree="lxml", omit_optional_tags=False)
            self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""", result)

        def testEntityNoResolve(self):
            doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
            tree = etree.fromstring(doc, parser=self.parser).getroottree()
            result = serialize(tree, tree="lxml", omit_optional_tags=False,
                               resolve_entities=False)
            self.assertEqual("""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)


def test_serializer():
    for filename in get_data_files('serializer', '*.test'):
        with open(filename) as fp:
            tests = json.load(fp)
            for index, test in enumerate(tests['tests']):
                yield runSerializerTest, test["input"], test["expected"], test.get("options", {})
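
These tests drive HTMLSerializer from hand-written JSON token lists via JsonWalker instead of from a parsed tree. The everyday path, assuming the public API of this release, goes parse, then tree walker, then serializer:

    import html5lib
    from html5lib.serializer import HTMLSerializer

    doc = html5lib.parse("<p class='a'>x")            # etree-backed document
    walker = html5lib.getTreeWalker("etree")          # walker matching the tree type
    serializer = HTMLSerializer(alphabetical_attributes=True)
    print(serializer.render(walker(doc)))             # serialized HTML string
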
@@ -1,183 +0,0 @@
from __future__ import absolute_import, division, unicode_literals

from . import support  # flake8: noqa
import unittest
import codecs
from io import BytesIO

from six.moves import http_client

from html5lib.inputstream import (BufferedStream, HTMLInputStream,
                                  HTMLUnicodeInputStream, HTMLBinaryInputStream)


class BufferedStreamTest(unittest.TestCase):
    def test_basic(self):
        s = b"abc"
        fp = BufferedStream(BytesIO(s))
        read = fp.read(10)
        assert read == s

    def test_read_length(self):
        fp = BufferedStream(BytesIO(b"abcdef"))
        read1 = fp.read(1)
        assert read1 == b"a"
        read2 = fp.read(2)
        assert read2 == b"bc"
        read3 = fp.read(3)
        assert read3 == b"def"
        read4 = fp.read(4)
        assert read4 == b""

    def test_tell(self):
        fp = BufferedStream(BytesIO(b"abcdef"))
        read1 = fp.read(1)
        assert fp.tell() == 1
        read2 = fp.read(2)
        assert fp.tell() == 3
        read3 = fp.read(3)
        assert fp.tell() == 6
        read4 = fp.read(4)
        assert fp.tell() == 6

    def test_seek(self):
        fp = BufferedStream(BytesIO(b"abcdef"))
        read1 = fp.read(1)
        assert read1 == b"a"
        fp.seek(0)
        read2 = fp.read(1)
        assert read2 == b"a"
        read3 = fp.read(2)
        assert read3 == b"bc"
        fp.seek(2)
        read4 = fp.read(2)
        assert read4 == b"cd"
        fp.seek(4)
        read5 = fp.read(2)
        assert read5 == b"ef"

    def test_seek_tell(self):
        fp = BufferedStream(BytesIO(b"abcdef"))
        read1 = fp.read(1)
        assert fp.tell() == 1
        fp.seek(0)
        read2 = fp.read(1)
        assert fp.tell() == 1
        read3 = fp.read(2)
        assert fp.tell() == 3
        fp.seek(2)
        read4 = fp.read(2)
        assert fp.tell() == 4
        fp.seek(4)
        read5 = fp.read(2)
        assert fp.tell() == 6


class HTMLUnicodeInputStreamShortChunk(HTMLUnicodeInputStream):
    _defaultChunkSize = 2


class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream):
    _defaultChunkSize = 2


class HTMLInputStreamTest(unittest.TestCase):

    def test_char_ascii(self):
        stream = HTMLInputStream(b"'", encoding='ascii')
        self.assertEqual(stream.charEncoding[0], 'ascii')
        self.assertEqual(stream.char(), "'")

    def test_char_utf8(self):
        stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8')
        self.assertEqual(stream.charEncoding[0], 'utf-8')
        self.assertEqual(stream.char(), '\u2018')

    def test_char_win1252(self):
        stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252'))
        self.assertEqual(stream.charEncoding[0], 'windows-1252')
        self.assertEqual(stream.char(), "\xa9")
        self.assertEqual(stream.char(), "\xf1")
        self.assertEqual(stream.char(), "\u2019")

    def test_bom(self):
        stream = HTMLInputStream(codecs.BOM_UTF8 + b"'")
        self.assertEqual(stream.charEncoding[0], 'utf-8')
        self.assertEqual(stream.char(), "'")

    def test_utf_16(self):
        stream = HTMLInputStream((' ' * 1025).encode('utf-16'))
        self.assertTrue(stream.charEncoding[0] in ['utf-16-le', 'utf-16-be'], stream.charEncoding)
        self.assertEqual(len(stream.charsUntil(' ', True)), 1025)

    def test_newlines(self):
        stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\r\nccc\rddddxe")
        self.assertEqual(stream.position(), (1, 0))
        self.assertEqual(stream.charsUntil('c'), "a\nbb\n")
        self.assertEqual(stream.position(), (3, 0))
        self.assertEqual(stream.charsUntil('x'), "ccc\ndddd")
        self.assertEqual(stream.position(), (4, 4))
        self.assertEqual(stream.charsUntil('e'), "x")
        self.assertEqual(stream.position(), (4, 5))

    def test_newlines2(self):
        size = HTMLUnicodeInputStream._defaultChunkSize
        stream = HTMLInputStream("\r" * size + "\n")
        self.assertEqual(stream.charsUntil('x'), "\n" * size)

    def test_position(self):
        stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\nccc\nddde\nf\ngh")
        self.assertEqual(stream.position(), (1, 0))
        self.assertEqual(stream.charsUntil('c'), "a\nbb\n")
        self.assertEqual(stream.position(), (3, 0))
        stream.unget("\n")
        self.assertEqual(stream.position(), (2, 2))
        self.assertEqual(stream.charsUntil('c'), "\n")
        self.assertEqual(stream.position(), (3, 0))
        stream.unget("\n")
        self.assertEqual(stream.position(), (2, 2))
        self.assertEqual(stream.char(), "\n")
        self.assertEqual(stream.position(), (3, 0))
        self.assertEqual(stream.charsUntil('e'), "ccc\nddd")
        self.assertEqual(stream.position(), (4, 3))
        self.assertEqual(stream.charsUntil('h'), "e\nf\ng")
        self.assertEqual(stream.position(), (6, 1))

    def test_position2(self):
        stream = HTMLUnicodeInputStreamShortChunk("abc\nd")
        self.assertEqual(stream.position(), (1, 0))
        self.assertEqual(stream.char(), "a")
        self.assertEqual(stream.position(), (1, 1))
        self.assertEqual(stream.char(), "b")
        self.assertEqual(stream.position(), (1, 2))
        self.assertEqual(stream.char(), "c")
        self.assertEqual(stream.position(), (1, 3))
        self.assertEqual(stream.char(), "\n")
        self.assertEqual(stream.position(), (2, 0))
        self.assertEqual(stream.char(), "d")
        self.assertEqual(stream.position(), (2, 1))

    def test_python_issue_20007(self):
        """
        Make sure we have a work-around for Python bug #20007
        http://bugs.python.org/issue20007
        """
        class FakeSocket(object):
            def makefile(self, _mode, _bufsize=None):
                return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")

        source = http_client.HTTPResponse(FakeSocket())
        source.begin()
        stream = HTMLInputStream(source)
        self.assertEqual(stream.charsUntil(" "), "Text")


def buildTestSuite():
    return unittest.defaultTestLoader.loadTestsFromName(__name__)


def main():
    buildTestSuite()
    unittest.main()

if __name__ == '__main__':
    main()
@@ -1,353 +0,0 @@
from __future__ import absolute_import, division, unicode_literals

import os
import sys
import unittest
import warnings
from difflib import unified_diff

try:
    unittest.TestCase.assertEqual
except AttributeError:
    unittest.TestCase.assertEqual = unittest.TestCase.assertEquals

from .support import get_data_files, TestData, convertExpected

from html5lib import html5parser, treewalkers, treebuilders, constants


def PullDOMAdapter(node):
    from xml.dom import Node
    from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, COMMENT, CHARACTERS

    if node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
        for childNode in node.childNodes:
            for event in PullDOMAdapter(childNode):
                yield event

    elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
        raise NotImplementedError("DOCTYPE nodes are not supported by PullDOM")

    elif node.nodeType == Node.COMMENT_NODE:
        yield COMMENT, node

    elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
        yield CHARACTERS, node

    elif node.nodeType == Node.ELEMENT_NODE:
        yield START_ELEMENT, node
        for childNode in node.childNodes:
            for event in PullDOMAdapter(childNode):
                yield event
        yield END_ELEMENT, node

    else:
        raise NotImplementedError("Node type not supported: " + str(node.nodeType))

treeTypes = {
    "DOM": {"builder": treebuilders.getTreeBuilder("dom"),
            "walker": treewalkers.getTreeWalker("dom")},
    "PullDOM": {"builder": treebuilders.getTreeBuilder("dom"),
                "adapter": PullDOMAdapter,
                "walker": treewalkers.getTreeWalker("pulldom")},
}

# Try whatever etree implementations are available from a list that are
# "supposed" to work
try:
    import xml.etree.ElementTree as ElementTree
except ImportError:
    pass
else:
    treeTypes['ElementTree'] = \
        {"builder": treebuilders.getTreeBuilder("etree", ElementTree),
         "walker": treewalkers.getTreeWalker("etree", ElementTree)}

try:
    import xml.etree.cElementTree as ElementTree
except ImportError:
    pass
else:
    treeTypes['cElementTree'] = \
        {"builder": treebuilders.getTreeBuilder("etree", ElementTree),
         "walker": treewalkers.getTreeWalker("etree", ElementTree)}


try:
    import lxml.etree as ElementTree  # flake8: noqa
except ImportError:
    pass
else:
    treeTypes['lxml_native'] = \
        {"builder": treebuilders.getTreeBuilder("lxml"),
         "walker": treewalkers.getTreeWalker("lxml")}


try:
    from genshi.core import QName, Attrs
    from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
except ImportError:
    pass
else:
    def GenshiAdapter(tree):
        text = None
        for token in treewalkers.getTreeWalker("dom")(tree):
            type = token["type"]
            if type in ("Characters", "SpaceCharacters"):
                if text is None:
                    text = token["data"]
                else:
                    text += token["data"]
            elif text is not None:
                yield TEXT, text, (None, -1, -1)
                text = None

            if type in ("StartTag", "EmptyTag"):
                if token["namespace"]:
                    name = "{%s}%s" % (token["namespace"], token["name"])
                else:
                    name = token["name"]
                attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
                               for attr, value in token["data"].items()])
                yield (START, (QName(name), attrs), (None, -1, -1))
                if type == "EmptyTag":
                    type = "EndTag"

            if type == "EndTag":
                if token["namespace"]:
                    name = "{%s}%s" % (token["namespace"], token["name"])
                else:
                    name = token["name"]

                yield END, QName(name), (None, -1, -1)

            elif type == "Comment":
                yield COMMENT, token["data"], (None, -1, -1)

            elif type == "Doctype":
                yield DOCTYPE, (token["name"], token["publicId"],
                                token["systemId"]), (None, -1, -1)

            else:
                pass  # FIXME: What to do?

        if text is not None:
            yield TEXT, text, (None, -1, -1)

    treeTypes["genshi"] = \
        {"builder": treebuilders.getTreeBuilder("dom"),
         "adapter": GenshiAdapter,
         "walker": treewalkers.getTreeWalker("genshi")}


def concatenateCharacterTokens(tokens):
    charactersToken = None
    for token in tokens:
        type = token["type"]
        if type in ("Characters", "SpaceCharacters"):
            if charactersToken is None:
                charactersToken = {"type": "Characters", "data": token["data"]}
            else:
                charactersToken["data"] += token["data"]
        else:
            if charactersToken is not None:
                yield charactersToken
                charactersToken = None
            yield token
    if charactersToken is not None:
        yield charactersToken


def convertTokens(tokens):
    output = []
    indent = 0
    for token in concatenateCharacterTokens(tokens):
        type = token["type"]
        if type in ("StartTag", "EmptyTag"):
            if (token["namespace"] and
                    token["namespace"] != constants.namespaces["html"]):
                if token["namespace"] in constants.prefixes:
                    name = constants.prefixes[token["namespace"]]
                else:
                    name = token["namespace"]
                name += " " + token["name"]
            else:
                name = token["name"]
            output.append("%s<%s>" % (" " * indent, name))
            indent += 2
            attrs = token["data"]
            if attrs:
                # TODO: Remove this if statement, attrs should always exist
                for (namespace, name), value in sorted(attrs.items()):
                    if namespace:
                        if namespace in constants.prefixes:
                            outputname = constants.prefixes[namespace]
                        else:
                            outputname = namespace
                        outputname += " " + name
                    else:
                        outputname = name
                    output.append("%s%s=\"%s\"" % (" " * indent, outputname, value))
            if type == "EmptyTag":
                indent -= 2
        elif type == "EndTag":
            indent -= 2
        elif type == "Comment":
            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
        elif type == "Doctype":
            if token["name"]:
                if token["publicId"]:
                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                                  (" " * indent, token["name"],
                                   token["publicId"],
                                   token["systemId"] and token["systemId"] or ""))
                elif token["systemId"]:
                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
                                  (" " * indent, token["name"],
                                   token["systemId"]))
                else:
                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
                                                       token["name"]))
            else:
                output.append("%s<!DOCTYPE >" % (" " * indent,))
        elif type in ("Characters", "SpaceCharacters"):
            output.append("%s\"%s\"" % (" " * indent, token["data"]))
        else:
            pass  # TODO: what to do with errors?
    return "\n".join(output)

import re
attrlist = re.compile(r"^(\s+)\w+=.*(\n\1\w+=.*)+", re.M)


def sortattrs(x):
    lines = x.group(0).split("\n")
    lines.sort()
    return "\n".join(lines)


class TokenTestCase(unittest.TestCase):
    def test_all_tokens(self):
        expected = [
            {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'},
            {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
            {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
            {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
            {'data': 'a', 'type': 'Characters'},
            {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
            {'data': 'b', 'type': 'Characters'},
            {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
            {'data': 'c', 'type': 'Characters'},
            {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
            {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
        ]
        for treeName, treeCls in treeTypes.items():
            p = html5parser.HTMLParser(tree=treeCls["builder"])
            document = p.parse("<html><head></head><body>a<div>b</div>c</body></html>")
            document = treeCls.get("adapter", lambda x: x)(document)
            output = treeCls["walker"](document)
            for expectedToken, outputToken in zip(expected, output):
                self.assertEqual(expectedToken, outputToken)


def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
    warnings.resetwarnings()
    warnings.simplefilter("error")
    try:
        p = html5parser.HTMLParser(tree=treeClass["builder"])
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            document = p.parse(input)
    except constants.DataLossWarning:
        # Ignore testcases we know we don't pass
        return

    document = treeClass.get("adapter", lambda x: x)(document)
    try:
        output = convertTokens(treeClass["walker"](document))
        output = attrlist.sub(sortattrs, output)
        expected = attrlist.sub(sortattrs, convertExpected(expected))
        diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],
                                    [line + "\n" for line in output.splitlines()],
                                    "Expected", "Received"))
        assert expected == output, "\n".join([
            "", "Input:", input,
            "", "Expected:", expected,
            "", "Received:", output,
            "", "Diff:", diff,
        ])
    except NotImplementedError:
        pass  # Amnesty for those that confess...


def test_treewalker():
    sys.stdout.write('Testing tree walkers ' + " ".join(list(treeTypes.keys())) + "\n")

    for treeName, treeCls in treeTypes.items():
        files = get_data_files('tree-construction')
        for filename in files:
            testName = os.path.basename(filename).replace(".dat", "")
            if testName in ("template",):
                continue

            tests = TestData(filename, "data")

            for index, test in enumerate(tests):
                (input, errors,
                 innerHTML, expected) = [test[key] for key in ("data", "errors",
                                                               "document-fragment",
                                                               "document")]
                errors = errors.split("\n")
                yield runTreewalkerTest, innerHTML, input, expected, errors, treeCls


def set_attribute_on_first_child(docfrag, name, value, treeName):
    """naively sets an attribute on the first child of the document
    fragment passed in"""
    setter = {'ElementTree': lambda d: d[0].set,
              'DOM': lambda d: d.firstChild.setAttribute}
    setter['cElementTree'] = setter['ElementTree']
    try:
        setter.get(treeName, setter['DOM'])(docfrag)(name, value)
    except AttributeError:
        setter['ElementTree'](docfrag)(name, value)


def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
    """tests what happens when we add attributes to the intext"""
    treeName, treeClass = tree
    parser = html5parser.HTMLParser(tree=treeClass["builder"])
    document = parser.parseFragment(intext)
    for nom, val in attrs_to_add:
        set_attribute_on_first_child(document, nom, val, treeName)

    document = treeClass.get("adapter", lambda x: x)(document)
    output = convertTokens(treeClass["walker"](document))
    output = attrlist.sub(sortattrs, output)
    if not output in expected:
        raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, output))


def test_treewalker_six_mix():
    """Str/Unicode mix. If str attrs added to tree"""

    # On Python 2.x string literals are of type str. Unless, like this
    # file, the programmer imports unicode_literals from __future__.
    # In that case, string literals become objects of type unicode.

    # This test simulates a Py2 user, modifying attributes on a document
    # fragment but not using the u'' syntax nor importing unicode_literals
    sm_tests = [
        ('<a href="http://example.com">Example</a>',
         [(str('class'), str('test123'))],
         '<a>\n  class="test123"\n  href="http://example.com"\n  "Example"'),

        ('<link href="http://example.com/cow">',
         [(str('rel'), str('alternate'))],
         '<link>\n  href="http://example.com/cow"\n  rel="alternate"\n  "Example"')
    ]

    for tree in treeTypes.items():
        for intext, attrs, expected in sm_tests:
            yield runTreewalkerEditTest, intext, expected, attrs, tree
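
The walker tests above compare token streams against the shared tree-construction data; convertTokens flattens a stream into the indented testdata format, and the raw tokens themselves are plain dicts, as test_all_tokens shows. A minimal sketch of obtaining such a stream (the document is illustrative):

    import html5lib

    doc = html5lib.parse("<div>b</div>")
    for token in html5lib.getTreeWalker("etree")(doc):
        print(token["type"], token.get("name"))  # character tokens have no "name" key
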
@@ -1,133 +0,0 @@
from __future__ import absolute_import, division, unicode_literals

import unittest

from html5lib.filters.whitespace import Filter
from html5lib.constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)

try:
    unittest.TestCase.assertEqual
except AttributeError:
    unittest.TestCase.assertEqual = unittest.TestCase.assertEquals


class TestCase(unittest.TestCase):
    def runTest(self, input, expected):
        output = list(Filter(input))
        errorMsg = "\n".join(["\n\nInput:", str(input),
                              "\nExpected:", str(expected),
                              "\nReceived:", str(output)])
        self.assertEqual(output, expected, errorMsg)

    def runTestUnmodifiedOutput(self, input):
        self.runTest(input, input)

    def testPhrasingElements(self):
        self.runTestUnmodifiedOutput(
            [{"type": "Characters", "data": "This is a "},
             {"type": "StartTag", "name": "span", "data": []},
             {"type": "Characters", "data": "phrase"},
             {"type": "EndTag", "name": "span", "data": []},
             {"type": "SpaceCharacters", "data": " "},
             {"type": "Characters", "data": "with"},
             {"type": "SpaceCharacters", "data": " "},
             {"type": "StartTag", "name": "em", "data": []},
             {"type": "Characters", "data": "emphasised text"},
             {"type": "EndTag", "name": "em", "data": []},
             {"type": "Characters", "data": " and an "},
             {"type": "StartTag", "name": "img", "data": [["alt", "image"]]},
             {"type": "Characters", "data": "."}])

    def testLeadingWhitespace(self):
        self.runTest(
            [{"type": "StartTag", "name": "p", "data": []},
             {"type": "SpaceCharacters", "data": spaceCharacters},
             {"type": "Characters", "data": "foo"},
             {"type": "EndTag", "name": "p", "data": []}],
            [{"type": "StartTag", "name": "p", "data": []},
             {"type": "SpaceCharacters", "data": " "},
             {"type": "Characters", "data": "foo"},
             {"type": "EndTag", "name": "p", "data": []}])

    def testLeadingWhitespaceAsCharacters(self):
        self.runTest(
            [{"type": "StartTag", "name": "p", "data": []},
             {"type": "Characters", "data": spaceCharacters + "foo"},
             {"type": "EndTag", "name": "p", "data": []}],
            [{"type": "StartTag", "name": "p", "data": []},
             {"type": "Characters", "data": " foo"},
             {"type": "EndTag", "name": "p", "data": []}])

    def testTrailingWhitespace(self):
        self.runTest(
            [{"type": "StartTag", "name": "p", "data": []},
             {"type": "Characters", "data": "foo"},
             {"type": "SpaceCharacters", "data": spaceCharacters},
             {"type": "EndTag", "name": "p", "data": []}],
            [{"type": "StartTag", "name": "p", "data": []},
             {"type": "Characters", "data": "foo"},
             {"type": "SpaceCharacters", "data": " "},
             {"type": "EndTag", "name": "p", "data": []}])

    def testTrailingWhitespaceAsCharacters(self):
        self.runTest(
            [{"type": "StartTag", "name": "p", "data": []},
             {"type": "Characters", "data": "foo" + spaceCharacters},
             {"type": "EndTag", "name": "p", "data": []}],
            [{"type": "StartTag", "name": "p", "data": []},
             {"type": "Characters", "data": "foo "},
             {"type": "EndTag", "name": "p", "data": []}])

    def testWhitespace(self):
        self.runTest(
            [{"type": "StartTag", "name": "p", "data": []},
             {"type": "Characters", "data": "foo" + spaceCharacters + "bar"},
             {"type": "EndTag", "name": "p", "data": []}],
            [{"type": "StartTag", "name": "p", "data": []},
             {"type": "Characters", "data": "foo bar"},
             {"type": "EndTag", "name": "p", "data": []}])

    def testLeadingWhitespaceInPre(self):
        self.runTestUnmodifiedOutput(
            [{"type": "StartTag", "name": "pre", "data": []},
             {"type": "SpaceCharacters", "data": spaceCharacters},
             {"type": "Characters", "data": "foo"},
             {"type": "EndTag", "name": "pre", "data": []}])

    def testLeadingWhitespaceAsCharactersInPre(self):
        self.runTestUnmodifiedOutput(
            [{"type": "StartTag", "name": "pre", "data": []},
             {"type": "Characters", "data": spaceCharacters + "foo"},
             {"type": "EndTag", "name": "pre", "data": []}])

    def testTrailingWhitespaceInPre(self):
        self.runTestUnmodifiedOutput(
            [{"type": "StartTag", "name": "pre", "data": []},
             {"type": "Characters", "data": "foo"},
             {"type": "SpaceCharacters", "data": spaceCharacters},
             {"type": "EndTag", "name": "pre", "data": []}])

    def testTrailingWhitespaceAsCharactersInPre(self):
        self.runTestUnmodifiedOutput(
            [{"type": "StartTag", "name": "pre", "data": []},
             {"type": "Characters", "data": "foo" + spaceCharacters},
             {"type": "EndTag", "name": "pre", "data": []}])

    def testWhitespaceInPre(self):
        self.runTestUnmodifiedOutput(
            [{"type": "StartTag", "name": "pre", "data": []},
             {"type": "Characters", "data": "foo" + spaceCharacters + "bar"},
             {"type": "EndTag", "name": "pre", "data": []}])


def buildTestSuite():
    return unittest.defaultTestLoader.loadTestsFromName(__name__)


def main():
    buildTestSuite()
    unittest.main()

if __name__ == "__main__":
    main()
@@ -1,76 +0,0 @@
"""A collection of modules for building different kinds of tree from
HTML documents.

To create a treebuilder for a new type of tree, you need to
implement several things:

1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.etree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.

2) A treebuilder object (called TreeBuilder by convention) that
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
It also has one required method:
getDocument - Returns the root node of the complete document tree

3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
"""

from __future__ import absolute_import, division, unicode_literals

from ..utils import default_etree

treeBuilderCache = {}


def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are:

               "dom" - A generic builder for DOM implementations, defaulting to
                       an xml.dom.minidom based implementation.
               "etree" - A generic builder for tree implementations exposing an
                         ElementTree-like interface, defaulting to
                         xml.etree.cElementTree if available and
                         xml.etree.ElementTree if not.
               "lxml" - An etree-based builder for lxml.etree, handling
                        limitations of lxml's implementation.

    implementation - (Currently applies to the "etree" and "dom" tree types). A
                      module implementing the tree type e.g.
                      xml.etree.ElementTree or xml.etree.cElementTree."""

    treeType = treeType.lower()
    if treeType not in treeBuilderCache:
        if treeType == "dom":
            from . import dom
            # Come up with a sane default (pref. from the stdlib)
            if implementation is None:
                from xml.dom import minidom
                implementation = minidom
            # NEVER cache here, caching is done in the dom submodule
            return dom.getDomModule(implementation, **kwargs).TreeBuilder
        elif treeType == "lxml":
            from . import etree_lxml
            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
        elif treeType == "etree":
            from . import etree
            if implementation is None:
                implementation = default_etree
            # NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
        else:
            raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
    return treeBuilderCache.get(treeType)
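
A short usage sketch of getTreeBuilder, assuming this era's public API, showing how the returned class plugs into the parser:

    from html5lib import html5parser, treebuilders

    TreeBuilder = treebuilders.getTreeBuilder("dom")   # minidom-backed builder
    parser = html5parser.HTMLParser(tree=TreeBuilder)
    document = parser.parse("<p>hello</p>")            # an xml.dom.minidom Document
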
@@ -1,57 +0,0 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.

To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.
"""

from __future__ import absolute_import, division, unicode_literals

import sys

from ..utils import default_etree

treeWalkerCache = {}


def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are:

               "dom" - The xml.dom.minidom DOM implementation
               "pulldom" - The xml.dom.pulldom event stream
               "etree" - A generic walker for tree implementations exposing an
                         elementtree-like interface (known to work with
                         ElementTree, cElementTree and lxml.etree).
               "lxml" - Optimized walker for lxml.etree
               "genshi" - a Genshi stream

    implementation - (Currently applies to the "etree" tree type only). A module
                      implementing the tree type e.g. xml.etree.ElementTree or
                      cElementTree."""

    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
        if treeType in ("dom", "pulldom"):
            name = "%s.%s" % (__name__, treeType)
            __import__(name)
            mod = sys.modules[name]
            treeWalkerCache[treeType] = mod.TreeWalker
        elif treeType == "genshi":
            from . import genshistream
            treeWalkerCache[treeType] = genshistream.TreeWalker
        elif treeType == "lxml":
            from . import lxmletree
            treeWalkerCache[treeType] = lxmletree.TreeWalker
        elif treeType == "etree":
            from . import etree
            if implementation is None:
                implementation = default_etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    return treeWalkerCache.get(treeType)
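
And the companion entry point on the walker side; pairing a tree builder with the matching walker turns a parsed document back into a tokenizer-style stream (a sketch under the same API assumption):

    import html5lib
    from html5lib import treewalkers

    doc = html5lib.parse("<span>x</span>")           # default etree-backed tree
    walker_cls = treewalkers.getTreeWalker("etree")  # walker for that tree type
    tokens = list(walker_cls(doc))                   # list of token dicts
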
@@ -1,63 +0,0 @@
from __future__ import absolute_import, division, unicode_literals

from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
    COMMENT, IGNORABLE_WHITESPACE, CHARACTERS

from . import _base

from ..constants import voidElements


class TreeWalker(_base.TreeWalker):
    def __iter__(self):
        ignore_until = None
        previous = None
        for event in self.tree:
            if previous is not None and \
                    (ignore_until is None or previous[1] is ignore_until):
                if previous[1] is ignore_until:
                    ignore_until = None
                for token in self.tokens(previous, event):
                    yield token
                    if token["type"] == "EmptyTag":
                        ignore_until = previous[1]
            previous = event
        if ignore_until is None or previous[1] is ignore_until:
            for token in self.tokens(previous, None):
                yield token
        elif ignore_until is not None:
            raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        type, node = event
        if type == START_ELEMENT:
            name = node.nodeName
            namespace = node.namespaceURI
            attrs = {}
            for attr in list(node.attributes.keys()):
                attr = node.getAttributeNode(attr)
                attrs[(attr.namespaceURI, attr.localName)] = attr.value
            if name in voidElements:
                for token in self.emptyTag(namespace,
                                           name,
                                           attrs,
                                           not next or next[1] is not node):
                    yield token
            else:
                yield self.startTag(namespace, name, attrs)

        elif type == END_ELEMENT:
            name = node.nodeName
            namespace = node.namespaceURI
            if name not in voidElements:
                yield self.endTag(namespace, name)

        elif type == COMMENT:
            yield self.comment(node.nodeValue)

        elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
            for token in self.text(node.nodeValue):
                yield token

        else:
            yield self.unknown(type)
@@ -1,16 +0,0 @@
#!/bin/bash -e

if [[ $USE_OPTIONAL != "true" && $USE_OPTIONAL != "false" ]]; then
    echo "fatal: \$USE_OPTIONAL not set to true or false. Exiting."
    exit 1
fi

pip install -r requirements-test.txt

if [[ $USE_OPTIONAL == "true" && $TRAVIS_PYTHON_VERSION != "pypy" ]]; then
    if [[ $TRAVIS_PYTHON_VERSION == "2.6" ]]; then
        pip install --allow-external Genshi --allow-insecure Genshi -r requirements-optional-2.6.txt
    else
        pip install --allow-external Genshi --allow-insecure Genshi -r requirements-optional-cpython.txt
    fi
fi
@@ -1,5 +0,0 @@
-r requirements-optional-cpython.txt

# Can be used to force attributes to be serialized in alphabetical
# order.
ordereddict
@@ -1,5 +0,0 @@
-r requirements-optional.txt

# lxml is supported with its own treebuilder ("lxml") and otherwise
# uses the standard ElementTree support
lxml
@@ -1,13 +0,0 @@
-r requirements.txt

# We support a Genshi treewalker that can be used to serialize Genshi
# streams.
genshi

# DATrie can be used in place of our Python trie implementation for
# slightly better parsing performance.
datrie

# charade can be used as a fallback in case we are unable to determine
# the encoding of a document.
charade
@ -1,5 +0,0 @@
-r requirements.txt

flake8
nose
ordereddict  # Python 2.6
@ -1,44 +0,0 @@
from distutils.core import setup
import os
import codecs

classifiers = [
    'Development Status :: 5 - Production/Stable',
    'Intended Audience :: Developers',
    'License :: OSI Approved :: MIT License',
    'Operating System :: OS Independent',
    'Programming Language :: Python',
    'Programming Language :: Python :: 2',
    'Programming Language :: Python :: 2.6',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.2',
    'Programming Language :: Python :: 3.3',
    'Topic :: Software Development :: Libraries :: Python Modules',
    'Topic :: Text Processing :: Markup :: HTML'
]

packages = ['html5lib'] + ['html5lib.' + name
                           for name in os.listdir(os.path.join('html5lib'))
                           if os.path.isdir(os.path.join('html5lib', name)) and
                           not name.startswith('.') and name != 'tests']

current_dir = os.path.dirname(__file__)
with codecs.open(os.path.join(current_dir, 'README.rst'), 'r', 'utf8') as readme_file:
    with codecs.open(os.path.join(current_dir, 'CHANGES.rst'), 'r', 'utf8') as changes_file:
        long_description = readme_file.read() + '\n' + changes_file.read()

setup(name='html5lib',
      version='0.9999-dev',
      url='https://github.com/html5lib/html5lib-python',
      license="MIT License",
      description='HTML parser based on the WHATWG HTML specification',
      long_description=long_description,
      classifiers=classifiers,
      maintainer='James Graham',
      maintainer_email='james@hoppipolla.co.uk',
      packages=packages,
      install_requires=[
          'six',
      ],
      )
@ -1,30 +0,0 @@
[tox]
envlist = py26,py27,py32,py33,py34,pypy

[testenv]
deps =
    -r{toxinidir}/requirements-optional-cpython.txt
    flake8
    nose
commands =
    {envbindir}/nosetests -q
    {toxinidir}/flake8-run.sh
install_command =
    pip install {opts} {packages}

[testenv:pypy]
# lxml doesn't work and datrie doesn't make sense
# (it's slower than the pure-python version)
deps =
    charade
    flake8
    Genshi
    nose
    six

[testenv:py26]
basepython = python2.6
deps =
    -r{toxinidir}/requirements-optional-2.6.txt
    flake8
    nose
@ -1,24 +0,0 @@
#!/usr/bin/env python
import sys
import urllib.request, urllib.error, urllib.parse
import codecs


def main():
    encodings = []
    f = urllib.request.urlopen(sys.argv[1])
    for line in f:
        line = line.decode("utf-8")  # urlopen yields bytes on Python 3
        if line.startswith("Name: ") or line.startswith("Alias: "):
            enc = line.split()[1]
            try:
                codecs.lookup(enc)
                if enc.lower() not in encodings:
                    encodings.append(enc.lower())
            except LookupError:
                pass
    sys.stdout.write("encodings = frozenset((\n")
    for enc in encodings:
        sys.stdout.write('    "%s",\n' % enc)
    sys.stdout.write('    ))')

if __name__ == "__main__":
    main()
@ -1,122 +0,0 @@
#!/usr/bin/env python
"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree

usage:
import spider
s = spider.Spider()
s.run("http://www.google.com", maxURLs=100)
"""

import urllib.request, urllib.error, urllib.parse
import urllib.robotparser
import hashlib

import httplib2

import html5lib
from html5lib.treebuilders import etree


class Spider(object):
    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = urllib.robotparser.RobotFileParser()
        self.contentDigest = {}
        self.http = httplib2.Http(".cache")

    def run(self, initialURL, maxURLs=1000):
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        failed = False
        p = html5lib.HTMLParser(tree=etree.TreeBuilder)
        try:
            tree = p.parse(content)
        except:
            self.buggyURLs.add(self.currentURL)
            failed = True
            print("BUGGY:", self.currentURL)
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        digest = hashlib.md5(content).hexdigest()
        if digest in self.contentDigest:
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url

        if resp['status'] != "200":
            content = None

        return content

    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the list of visited and unvisited URLs according to whether we
        have seen them before or not"""
        urls = set()
        # Remove all links we have already visited
        for link in tree.findall(".//a"):
            try:
                url = urllib.parse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and url
                        not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                pass

        # Remove all non-http URLs and add a suitable base URL where that is
        # missing
        newUrls = set()
        for url in urls:
            splitURL = list(urllib.parse.urlsplit(url))
            if splitURL[0] != "http":
                continue
            if splitURL[1] == "":
                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
            newUrls.add(urllib.parse.urlunsplit(splitURL))
        urls = newUrls

        responseHeaders = {}
        # Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except (AttributeError, KeyError):
                # Don't know why this happens
                pass

        # Remove links not of content-type html or pages not found
        # XXX - need to deal with other status codes?
        toVisit = set([url for url in urls if url in responseHeaders and
                       "html" in responseHeaders[url]['content-type'] and
                       responseHeaders[url]['status'] == "200"])

        # Now check we are allowed to spider the page
        for url in list(toVisit):  # iterate over a copy so entries can be removed
            robotURL = list(urllib.parse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            robotURL = urllib.parse.urlunsplit(robotURL)
            self.robotParser.set_url(robotURL)
            if not self.robotParser.can_fetch("*", url):
                toVisit.remove(url)

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(toVisit)
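A hedged usage sketch matching the module's own docstring (``httplib2`` must be installed; the start URL and crawl limit are placeholders):

    import spider

    s = spider.Spider()
    s.run("http://example.com/", maxURLs=10)
    # URLs whose content raised an exception inside the parser.
    print("buggy URLs:", sorted(s.buggyURLs))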
@ -5,14 +5,15 @@ here = os.path.abspath(os.path.split(__file__)[0])
 repo_root = os.path.abspath(os.path.join(here, os.pardir))
 
 sys.path.insert(0, os.path.join(here))
-sys.path.insert(0, os.path.join(here, "six"))
-sys.path.insert(0, os.path.join(here, "html5lib"))
 sys.path.insert(0, os.path.join(here, "wptserve"))
 sys.path.insert(0, os.path.join(here, "pywebsocket"))
 sys.path.insert(0, os.path.join(here, "third_party", "attrs", "src"))
 sys.path.insert(0, os.path.join(here, "third_party", "funcsigs"))
+sys.path.insert(0, os.path.join(here, "third_party", "html5lib"))
 sys.path.insert(0, os.path.join(here, "third_party", "pluggy"))
 sys.path.insert(0, os.path.join(here, "third_party", "py"))
 sys.path.insert(0, os.path.join(here, "third_party", "pytest"))
+sys.path.insert(0, os.path.join(here, "third_party", "six"))
+sys.path.insert(0, os.path.join(here, "third_party", "webencodings"))
 sys.path.insert(0, os.path.join(here, "webdriver"))
 sys.path.insert(0, os.path.join(here, "wptrunner"))
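A minimal sketch of how this module is consumed: importing ``localpaths`` first makes the vendored packages importable without installation. The import-for-side-effects pattern is an assumption based on the ``sys.path.insert`` calls in the hunk above:

    import localpaths  # noqa: F401 -- imported for its sys.path side effects

    import html5lib
    import six
    import webencodings

    # These should resolve to the vendored copies under third_party/.
    print(html5lib.__file__, six.__file__, webencodings.__file__)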
@ -1,9 +0,0 @@
*#
*.py[co]
*.sw[po]
*~
MANIFEST
documentation/_build
\#*
.tox
six.egg-info
@ -1,8 +0,0 @@
syntax: glob
*.pyc
dist
MANIFEST
documentation/_build
.tox
.gitignore
six.egg-info
@ -1,22 +0,0 @@
The primary author and maintainer of six is Benjamin Peterson. He would like to
acknowledge the following people who submitted bug reports, pull requests, and
otherwise worked to improve six:

Marc Abramowitz
Alexander Artemenko
Aymeric Augustin
Ned Batchelder
Jason R. Coombs
Julien Danjou
Ben Darnell
Ben Davis
Joshua Harlow
Anselm Kruis
Alexander Lukanin
James Mills
Sridhar Ratnakumar
Erik Rose
Peter Ruibal
Miroslav Shubernetskiy

If you think you belong on this list, please let me know! --Benjamin
@ -1,16 +0,0 @@
Six is a Python 2 and 3 compatibility library. It provides utility functions
for smoothing over the differences between the Python versions with the goal of
writing Python code that is compatible on both Python versions. See the
documentation for more information on what is provided.

Six supports every Python version since 2.5. It is contained in only one Python
file, so it can be easily copied into your project. (The copyright and license
notice must be retained.)

Online documentation is at http://pythonhosted.org/six/.

Bugs can be reported to https://bitbucket.org/gutworth/six. The code can also
be found there.

For questions about six or porting in general, email the python-porting mailing
list: http://mail.python.org/mailman/listinfo/python-porting
@ -1,2 +0,0 @@
[wheel]
universal = 1
@ -1,32 +0,0 @@
from __future__ import with_statement

try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

import six

six_classifiers = [
    "Programming Language :: Python :: 2",
    "Programming Language :: Python :: 3",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Topic :: Software Development :: Libraries",
    "Topic :: Utilities",
]

with open("README", "r") as fp:
    six_long_description = fp.read()

setup(name="six",
      version=six.__version__,
      author="Benjamin Peterson",
      author_email="benjamin@python.org",
      url="http://pypi.python.org/pypi/six/",
      py_modules=["six"],
      description="Python 2 and 3 compatibility utilities",
      long_description=six_long_description,
      license="MIT",
      classifiers=six_classifiers
      )
@ -1,12 +0,0 @@
[tox]
envlist=py25,py26,py27,py31,py32,py33,py34,pypy
indexserver=
    default = http://pypi.python.org/simple
    testrun = http://pypi.testrun.org

[testenv]
deps=pytest
commands= py.test -rfsxX {posargs}

[pytest]
minversion=2.2.0
@ -0,0 +1,31 @@
# To activate, change the Appveyor settings to use `.appveyor.yml`.
environment:
  global:
    PATH: "C:\\Python27\\Scripts\\;%PATH%"
    PYTEST_COMMAND: "coverage run -m pytest"
  matrix:
    - TOXENV: py27-base
    - TOXENV: py27-optional
    - TOXENV: py33-base
    - TOXENV: py33-optional
    - TOXENV: py34-base
    - TOXENV: py34-optional
    - TOXENV: py35-base
    - TOXENV: py35-optional
    - TOXENV: py36-base
    - TOXENV: py36-optional

install:
  - git submodule update --init --recursive
  - python -m pip install tox codecov

build: off

test_script:
  - tox

after_test:
  - python debug-info.py

on_success:
  - codecov
@ -0,0 +1,8 @@
[run]
branch = True
source = html5lib

[paths]
source =
    html5lib
    .tox/*/lib/python*/site-packages/html5lib
@ -0,0 +1,85 @@
# Copyright (c) 2014 GitHub, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
doc/_build/

# PyBuilder
target/

# Generated by parse.py -p
stats.prof

# IDE
.idea
@ -0,0 +1,21 @@
strictness: veryhigh
doc-warnings: false
test-warnings: false

max-line-length: 139

requirements:
  - requirements.txt
  - requirements-test.txt
  - requirements-optional.txt

ignore-paths:
  - parse.py
  - utils/

python-targets:
  - 2
  - 3

mccabe:
  run: false
@ -0,0 +1,10 @@
[MASTER]
ignore=tests

[MESSAGES CONTROL]
# messages up to fixme should probably be fixed somehow
disable = redefined-builtin,attribute-defined-outside-init,anomalous-backslash-in-string,no-self-use,redefined-outer-name,bad-continuation,wrong-import-order,superfluous-parens,no-member,duplicate-code,super-init-not-called,abstract-method,property-on-old-class,wrong-import-position,no-name-in-module,no-init,bad-mcs-classmethod-argument,bad-classmethod-argument,fixme,invalid-name,import-error,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-boolean-expressions,too-many-branches,too-many-instance-attributes,too-many-locals,too-many-lines,too-many-public-methods,too-many-return-statements,too-many-statements,missing-docstring,line-too-long,locally-disabled,locally-enabled,bad-builtin,deprecated-lambda

[FORMAT]
max-line-length=139
single-line-if-stmt=no
(The diff for one file is not shown because of its size.)
@ -0,0 +1,32 @@
language: python
python:
  - "pypy"
  - "3.6"
  - "3.5"
  - "3.4"
  - "3.3"
  - "2.7"

sudo: false

cache: pip

env:
  global:
    - PYTEST_COMMAND="coverage run -m pytest"
  matrix:
    - TOXENV=optional
    - TOXENV=base
    - TOXENV=six19-optional

install:
  - pip install tox codecov

script:
  - tox

after_script:
  - python debug-info.py

after_success:
  - codecov
@ -6,6 +6,7 @@ Credits
 - James Graham
 - Geoffrey Sneddon
 - Łukasz Langa
+- Will Kahn-Greene
 
 
 Patches and suggestions

@ -16,19 +17,50 @@ Patches and suggestions
 - Lachlan Hunt
 - lantis63
 - Sam Ruby
-- Tim Fletcher
 - Thomas Broyer
+- Tim Fletcher
 - Mark Pilgrim
-- Philip Taylor
 - Ryan King
+- Philip Taylor
 - Edward Z. Yang
 - fantasai
 - Philip Jägenstedt
 - Ms2ger
+- Mohammad Taha Jahangir
 - Andy Wingo
 - Andreas Madsack
 - Karim Valiev
-- Mohammad Taha Jahangir
 - Juan Carlos Garcia Segovia
 - Mike West
 - Marc DM
+- Simon Sapin
+- Michael[tm] Smith
+- Ritwik Gupta
+- Marc Abramowitz
+- Tony Lopes
+- lilbludevil
+- Kevin
+- Drew Hubl
+- Austin Kumbera
+- Jim Baker
+- Jon Dufresne
+- Donald Stufft
+- Alex Gaynor
+- Nik Nyby
+- Jakub Wilk
+- Sigmund Cherem
+- Gabi Davar
+- Florian Mounier
+- neumond
+- Vitalik Verhovodov
+- Kovid Goyal
+- Adam Chainz
+- John Vandenberg
+- Eric Amorde
+- Benedikt Morbach
+- Jonathan Vanasco
+- Tom Most
+- Ville Skyttä
+- Hugo van Kemenade
+- Mark Vasilkov
@ -0,0 +1,335 @@
Change Log
----------

1.0.1
~~~~~

Released on December 7, 2017

Breaking changes:

* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!)
* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!)

Features:

* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most,
  Will Kahn-Greene!)
* Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!)
* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!)
* Support Python 3.6. (#333) (Thank you, Jon Dufresne!)
* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!)
* Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon
  Dufresne, John Vandenberg, Geoffrey Sneddon, Will Kahn-Greene!)
* Semver-compliant version number.

Bug fixes:

* Add support for setuptools < 18.5 to support environment markers. (Thank you,
  John Vandenberg!)
* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!)
* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank
  you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!)
* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will
  Kahn-Greene!)
* Include license file in generated wheel package. (#350) (Thank you, Jon
  Dufresne!)
* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!)
* Allow uppercase hex characters in CSS colour check. (#377) (Thank you,
  Komal Dembla, Hugo!)


1.0
~~~

Released and unreleased on December 7, 2017. Badly packaged release.


0.999999999/1.0b10
~~~~~~~~~~~~~~~~~~

Released on July 15, 2016

* Fix attribute order going to the tree builder to be document order
  instead of reverse document order(!).


0.99999999/1.0b9
~~~~~~~~~~~~~~~~

Released on July 14, 2016

* **Added ordereddict as a mandatory dependency on Python 2.6.**

* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all``
  extras that will do the right thing based on the specific
  interpreter implementation.

* Now requires the ``mock`` package for the testsuite.

* Cease supporting DATrie under PyPy.

* **Remove PullDOM support, as this hasn't ever been properly
  tested, doesn't entirely work, and as far as I can tell is
  completely unused by anyone.**

* Move testsuite to ``py.test``.

* **Fix #124: move to webencodings for decoding the input byte stream;
  this makes html5lib compliant with the Encoding Standard, and
  introduces a required dependency on webencodings.**

* **Cease supporting Python 3.2 (in both CPython and PyPy forms).**

* **Fix comments containing double-dash with lxml 3.5 and above.**

* **Use scripting disabled by default (as we don't implement
  scripting).**

* **Fix #11, avoiding the XSS bug potentially caused by serializer
  allowing attribute values to be escaped out of in old browser versions,
  changing the quote_attr_values option on serializer to take one of
  three values, "always" (the old True value), "legacy" (the new option,
  and the new default), and "spec" (the old False value, and the old
  default).**

* **Fix #72 by rewriting the sanitizer to apply only to treewalkers
  (instead of the tokenizer); as such, this will require amending all
  callers of it to use it via the treewalker API.**

* **Drop support of charade, now that chardet is supported once more.**

* **Replace the charset keyword argument on parse and related methods
  with a set of keyword arguments: override_encoding, transport_encoding,
  same_origin_parent_encoding, likely_encoding, and default_encoding.**

* **Move filters._base, treebuilder._base, and treewalkers._base to .base
  to clarify their status as public.**

* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the
  sanitizer.htmlsanitizer module and move that to sanitizer. This means
  anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no
  code changes.**

* **Rename treewalkers.lxmletree to .etree_lxml and
  treewalkers.genshistream to .genshi to have a consistent API.**

* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer,
  utils) to be underscore prefixed to clarify their status as private.


0.9999999/1.0b8
~~~~~~~~~~~~~~~

Released on September 10, 2015

* Fix #195: fix the sanitizer to drop broken URLs (it threw an
  exception between 0.9999 and 0.999999).


0.999999/1.0b7
~~~~~~~~~~~~~~

Released on July 7, 2015

* Fix #189: fix the sanitizer to allow relative URLs again (as it did
  prior to 0.9999/1.0b5).


0.99999/1.0b6
~~~~~~~~~~~~~

Released on April 30, 2015

* Fix #188: fix the sanitizer to not throw an exception when sanitizing
  bogus data URLs.


0.9999/1.0b5
~~~~~~~~~~~~

Released on April 29, 2015

* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how
  this sounds, this has no known security implications. No known version
  of IE (5.5 to current), Firefox (3 to current), Safari (6 to current),
  Chrome (1 to current), or Opera (12 to current) will run any script
  provided in these attributes.

* Pass error message to the ParseError exception in strict parsing mode.

* Allow data URIs in the sanitizer, with a whitelist of content-types.

* Add support for Python implementations that don't support lone
  surrogates (read: Jython). Fixes #2.

* Remove localization of error messages. This functionality was totally
  unused (and untested that everything was localizable), so we may as
  well follow numerous browsers in not supporting translating technical
  strings.

* Expose treewalkers.pprint as a public API.

* Add a documentEncoding property to HTML5Parser, fix #121.


0.999
~~~~~

Released on December 23, 2013

* Fix #127: add work-around for CPython issue #20007: .read(0) on
  http.client.HTTPResponse drops the rest of the content.

* Fix #115: lxml treewalker can now deal with fragments containing, at
  their root level, text nodes with non-ASCII characters on Python 2.


0.99
~~~~

Released on September 10, 2013

* No library changes from 1.0b3; released as 0.99 as pip has changed
  behaviour from 1.4 to avoid installing pre-release versions per
  PEP 440.


1.0b3
~~~~~

Released on July 24, 2013

* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
  implementation using it should be moved to
  ``NonRecursiveTreeWalker``, as everything bundled with html5lib has
  for years.

* Fix #67 so that ``BufferedStream`` correctly returns a bytes
  object, thereby fixing any case where html5lib is passed a
  non-seekable RawIOBase-like object.


1.0b2
~~~~~

Released on June 27, 2013

* Removed reordering of attributes within the serializer. There is now
  an ``alphabetical_attributes`` option which preserves the previous
  behaviour through a new filter. This allows attribute order to be
  preserved through html5lib if the tree builder preserves order.

* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
  ``treeadapters.sax.to_sax`` which is generic and supports any
  treewalker; it also resolves all known bugs with ``dom2sax``.

* Fix treewalker assertions on hitting bytes strings on
  Python 2. Previous to 1.0b1, treewalkers coped with mixed
  bytes/unicode data on Python 2; this reintroduces this prior
  behaviour on Python 2. Behaviour is unchanged on Python 3.


1.0b1
~~~~~

Released on May 17, 2013

* Implementation updated to implement the `HTML specification
  <http://www.whatwg.org/specs/web-apps/current-work/>`_ as of 5th May
  2013 (`SVN <http://svn.whatwg.org/webapps/>`_ revision r7867).

* Python 3.2+ supported in a single codebase using the ``six`` library.

* Removed support for Python 2.5 and older.

* Removed the deprecated Beautiful Soup 3 treebuilder.
  ``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that
  since it doesn't support namespaces, foreign content like SVG and
  MathML is parsed incorrectly.

* Removed ``simpletree`` from the package. The default tree builder is
  now ``etree`` (using the ``xml.etree.cElementTree`` implementation if
  available, and ``xml.etree.ElementTree`` otherwise).

* Removed the ``XHTMLSerializer`` as it never actually guaranteed its
  output was well-formed XML, and hence provided little of use.

* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no
  longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
  return the default DOM treebuilder, which uses ``xml.dom.minidom``.

* Optional heuristic character encoding detection now based on
  ``charade`` for Python 2.6 - 3.3 compatibility.

* Optional ``Genshi`` treewalker support fixed.

* Many bugfixes, including:

  * #33: null in attribute value breaks XML AttValue;

  * #4: nested, indirect descendant, <button> causes infinite loop;

  * `Google Code 215
    <http://code.google.com/p/html5lib/issues/detail?id=215>`_: Properly
    detect seekable streams;

  * `Google Code 206
    <http://code.google.com/p/html5lib/issues/detail?id=206>`_: add
    support for <video preload=...>, <audio preload=...>;

  * `Google Code 205
    <http://code.google.com/p/html5lib/issues/detail?id=205>`_: add
    support for <video poster=...>;

  * `Google Code 202
    <http://code.google.com/p/html5lib/issues/detail?id=202>`_: Unicode
    file breaks InputStream.

* Source code is now mostly PEP 8 compliant.

* Test harness has been improved and now depends on ``nose``.

* Documentation updated and moved to https://html5lib.readthedocs.io/.


0.95
~~~~

Released on February 11, 2012


0.90
~~~~

Released on January 17, 2010


0.11.1
~~~~~~

Released on June 12, 2008


0.11
~~~~

Released on June 10, 2008


0.10
~~~~

Released on October 7, 2007


0.9
~~~

Released on March 11, 2007


0.2
~~~

Released on January 8, 2007
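A hedged sketch of the 1.0b9 serializer change described above: ``quote_attr_values`` now takes ``"always"``, ``"legacy"``, or ``"spec"``. The markup below is illustrative:

    import html5lib
    from html5lib.serializer import HTMLSerializer

    dom = html5lib.parse("<p class=test>hi", treebuilder="dom")
    walker = html5lib.getTreeWalker("dom")
    # "always" reproduces the old True behaviour; "legacy" is the new default.
    serializer = HTMLSerializer(quote_attr_values="always")
    print("".join(serializer.serialize(walker(dom))))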
@ -1,6 +1,10 @@
 include LICENSE
+include AUTHORS.rst
 include CHANGES.rst
 include README.rst
 include requirements*.txt
+include .pytest.expect
+include tox.ini
+include pytest.ini
 graft html5lib/tests/testdata
 recursive-include html5lib/tests *.py
@ -51,7 +51,7 @@ pass into html5lib as follows:
    import html5lib
 
    with closing(urlopen("http://example.com/")) as f:
-       document = html5lib.parse(f, encoding=f.info().getparam("charset"))
+       document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
 
 When using with ``urllib.request`` (Python 3), the charset from HTTP
 should be passed into html5lib as follows:
@ -62,7 +62,7 @@ should be passed into html5lib as follows:
    import html5lib
 
    with urlopen("http://example.com/") as f:
-       document = html5lib.parse(f, encoding=f.info().get_content_charset())
+       document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
 
 To have more control over the parser, create a parser object explicitly.
 For instance, to make the parser raise exceptions on parse errors, use:
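A small sketch of the strict mode the sentence above refers to; the ``strict`` flag is html5lib's documented option for raising on the first parse error:

    import html5lib

    parser = html5lib.HTMLParser(strict=True)
    try:
        parser.parse("<p>missing doctype")
    except Exception as err:  # html5lib raises ParseError on the first error
        print(type(err).__name__, err)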
@ -84,13 +84,13 @@ format:
    parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
    minidom_document = parser.parse("<p>Hello World!")
 
-More documentation is available at http://html5lib.readthedocs.org/.
+More documentation is available at https://html5lib.readthedocs.io/.
 
 
 Installation
 ------------
 
-html5lib works on CPython 2.6+, CPython 3.2+ and PyPy. To install it,
+html5lib works on CPython 2.7+, CPython 3.3+ and PyPy. To install it,
 use:
 
 .. code-block:: bash
@ -104,8 +104,8 @@ Optional Dependencies
 The following third-party libraries may be used for additional
 functionality:
 
-- ``datrie`` can be used to improve parsing performance (though in
-  almost all cases the improvement is marginal);
+- ``datrie`` can be used under CPython to improve parsing performance
+  (though in almost all cases the improvement is marginal);
 
 - ``lxml`` is supported as a tree format (for both building and
   walking) under CPython (but *not* PyPy where it is known to cause
@ -113,13 +113,8 @@ functionality:
 
 - ``genshi`` has a treewalker (but not builder); and
 
-- ``charade`` can be used as a fallback when character encoding cannot
-  be determined; ``chardet``, from which it was forked, can also be used
-  on Python 2.
-
-- ``ordereddict`` can be used under Python 2.6
-  (``collections.OrderedDict`` is used instead on later versions) to
-  serialize attributes in alphabetical order.
+- ``chardet`` can be used as a fallback when character encoding cannot
+  be determined.
 
 
 Bugs
|
||||||
Tests
|
Tests
|
||||||
-----
|
-----
|
||||||
|
|
||||||
Unit tests require the ``nose`` library and can be run using the
|
Unit tests require the ``pytest`` and ``mock`` libraries and can be
|
||||||
``nosetests`` command in the root directory; ``ordereddict`` is
|
run using the ``py.test`` command in the root directory.
|
||||||
required under Python 2.6. All should pass.
|
|
||||||
|
|
||||||
Test data are contained in a separate `html5lib-tests
|
Test data are contained in a separate `html5lib-tests
|
||||||
<https://github.com/html5lib/html5lib-tests>`_ repository and included
|
<https://github.com/html5lib/html5lib-tests>`_ repository and included
|
|
@ -12,7 +12,7 @@ info = {
     "maxsize": sys.maxsize
 }
 
-search_modules = ["charade", "chardet", "datrie", "genshi", "html5lib", "lxml", "six"]
+search_modules = ["chardet", "datrie", "genshi", "html5lib", "lxml", "six"]
 found_modules = []
 
 for m in search_modules:
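A minimal sketch of what a probe loop over ``search_modules`` can look like; the ``importlib`` approach is an assumption, since debug-info.py's exact reporting is outside this hunk:

    import importlib

    search_modules = ["chardet", "datrie", "genshi", "html5lib", "lxml", "six"]
    for m in search_modules:
        try:
            module = importlib.import_module(m)
            print(m, getattr(module, "__version__", "unknown version"))
        except ImportError:
            print(m, "not installed")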
@ -126,7 +126,7 @@ html_theme = 'default'
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+#html_static_path = ['_static']
 
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
|
||||||
filters Package
|
filters Package
|
||||||
===============
|
===============
|
||||||
|
|
||||||
:mod:`_base` Module
|
:mod:`base` Module
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
.. automodule:: html5lib.filters._base
|
.. automodule:: html5lib.filters.base
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
:special-members: __init__
|
||||||
|
|
||||||
:mod:`alphabeticalattributes` Module
|
:mod:`alphabeticalattributes` Module
|
||||||
------------------------------------
|
------------------------------------
|
||||||
|
|
||||||
.. automodule:: html5lib.filters.alphabeticalattributes
|
.. automodule:: html5lib.filters.alphabeticalattributes
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
:special-members: __init__
|
||||||
|
|
||||||
:mod:`inject_meta_charset` Module
|
:mod:`inject_meta_charset` Module
|
||||||
---------------------------------
|
---------------------------------
|
||||||
|
|
||||||
.. automodule:: html5lib.filters.inject_meta_charset
|
.. automodule:: html5lib.filters.inject_meta_charset
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
:special-members: __init__
|
||||||
|
|
||||||
:mod:`lint` Module
|
:mod:`lint` Module
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
.. automodule:: html5lib.filters.lint
|
.. automodule:: html5lib.filters.lint
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
:special-members: __init__
|
||||||
|
|
||||||
:mod:`optionaltags` Module
|
:mod:`optionaltags` Module
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
.. automodule:: html5lib.filters.optionaltags
|
.. automodule:: html5lib.filters.optionaltags
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
:special-members: __init__
|
||||||
|
|
||||||
:mod:`sanitizer` Module
|
:mod:`sanitizer` Module
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
||||||
.. automodule:: html5lib.filters.sanitizer
|
.. automodule:: html5lib.filters.sanitizer
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
:special-members: __init__
|
||||||
|
|
||||||
:mod:`whitespace` Module
|
:mod:`whitespace` Module
|
||||||
------------------------
|
------------------------
|
||||||
|
|
||||||
.. automodule:: html5lib.filters.whitespace
|
.. automodule:: html5lib.filters.whitespace
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
:special-members: __init__
|
|
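A small sketch of driving one of the filters documented above; ``lint.Filter`` wraps any token stream and raises on malformed tokens, and the markup is illustrative:

    import html5lib
    from html5lib.filters import lint

    dom = html5lib.parse("<p>hi", treebuilder="dom")
    walker = html5lib.getTreeWalker("dom")
    # Consuming the wrapped stream forces the lint checks to run.
    for token in lint.Filter(walker(dom)):
        pass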
@ -0,0 +1,38 @@
html5lib Package
================

.. automodule:: html5lib
    :members: __version__

:mod:`constants` Module
-----------------------

.. automodule:: html5lib.constants
    :members:
    :show-inheritance:

:mod:`html5parser` Module
-------------------------

.. automodule:: html5lib.html5parser
    :members:
    :show-inheritance:
    :special-members: __init__

:mod:`serializer` Module
------------------------

.. automodule:: html5lib.serializer
    :members:
    :show-inheritance:
    :special-members: __init__

Subpackages
-----------

.. toctree::

    html5lib.filters
    html5lib.treebuilders
    html5lib.treewalkers
    html5lib.treeadapters
testing/web-platform/tests/tools/third_party/html5lib/doc/html5lib.treeadapters.rst (vendored; new file; 20 lines)
@ -0,0 +1,20 @@
treeadapters Package
====================

:mod:`~html5lib.treeadapters` Package
-------------------------------------

.. automodule:: html5lib.treeadapters
    :members:
    :show-inheritance:
    :special-members: __init__

.. automodule:: html5lib.treeadapters.genshi
    :members:
    :show-inheritance:
    :special-members: __init__

.. automodule:: html5lib.treeadapters.sax
    :members:
    :show-inheritance:
    :special-members: __init__
@ -6,38 +6,37 @@ treebuilders Package
 
 .. automodule:: html5lib.treebuilders
     :members:
-    :undoc-members:
     :show-inheritance:
+    :special-members: __init__
 
-:mod:`_base` Module
+:mod:`base` Module
 -------------------
 
-.. automodule:: html5lib.treebuilders._base
+.. automodule:: html5lib.treebuilders.base
     :members:
-    :undoc-members:
     :show-inheritance:
+    :special-members: __init__
 
 :mod:`dom` Module
 -----------------
 
 .. automodule:: html5lib.treebuilders.dom
     :members:
-    :undoc-members:
     :show-inheritance:
+    :special-members: __init__
 
 :mod:`etree` Module
 -------------------
 
 .. automodule:: html5lib.treebuilders.etree
     :members:
-    :undoc-members:
     :show-inheritance:
+    :special-members: __init__
 
 :mod:`etree_lxml` Module
 ------------------------
 
 .. automodule:: html5lib.treebuilders.etree_lxml
     :members:
-    :undoc-members:
     :show-inheritance:
+    :special-members: __init__
testing/web-platform/tests/tools/third_party/html5lib/doc/html5lib.treewalkers.rst (vendored; new file; 50 lines)
@ -0,0 +1,50 @@
treewalkers Package
===================

:mod:`treewalkers` Package
--------------------------

.. automodule:: html5lib.treewalkers
    :members:
    :show-inheritance:
    :special-members: __init__

:mod:`base` Module
------------------

.. automodule:: html5lib.treewalkers.base
    :members:
    :show-inheritance:
    :special-members: __init__

:mod:`dom` Module
-----------------

.. automodule:: html5lib.treewalkers.dom
    :members:
    :show-inheritance:
    :special-members: __init__

:mod:`etree` Module
-------------------

.. automodule:: html5lib.treewalkers.etree
    :members:
    :show-inheritance:
    :special-members: __init__

:mod:`etree_lxml` Module
------------------------

.. automodule:: html5lib.treewalkers.etree_lxml
    :members:
    :show-inheritance:
    :special-members: __init__

:mod:`genshi` Module
--------------------

.. automodule:: html5lib.treewalkers.genshi
    :members:
    :show-inheritance:
    :special-members: __init__
@ -8,6 +8,7 @@ Overview
    :maxdepth: 2
 
    movingparts
+   modules
    changes
    License <license>
@ -4,22 +4,25 @@ The moving parts
|
||||||
html5lib consists of a number of components, which are responsible for
|
html5lib consists of a number of components, which are responsible for
|
||||||
handling its features.
|
handling its features.
|
||||||
|
|
||||||
|
Parsing uses a *tree builder* to generate a *tree*, the in-memory representation of the document.
|
||||||
|
Several tree representations are supported, as are translations to other formats via *tree adapters*.
|
||||||
|
The tree may be translated to a token stream with a *tree walker*, from which :class:`~html5lib.serializer.HTMLSerializer` produces a stream of bytes.
|
||||||
|
The token stream may also be transformed by use of *filters* to accomplish tasks like sanitization.
|
||||||
|
|
||||||
Tree builders
|
Tree builders
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
The parser reads HTML by tokenizing the content and building a tree that
|
The parser reads HTML by tokenizing the content and building a tree that
|
||||||
the user can later access. There are three main types of trees that
|
the user can later access. html5lib can build three types of trees:
|
||||||
html5lib can build:
|
|
||||||
|
|
||||||
* ``etree`` - this is the default; builds a tree based on ``xml.etree``,
|
* ``etree`` - this is the default; builds a tree based on :mod:`xml.etree`,
|
||||||
which can be found in the standard library. Whenever possible, the
|
which can be found in the standard library. Whenever possible, the
|
||||||
accelerated ``ElementTree`` implementation (i.e.
|
accelerated ``ElementTree`` implementation (i.e.
|
||||||
``xml.etree.cElementTree`` on Python 2.x) is used.
|
``xml.etree.cElementTree`` on Python 2.x) is used.
|
||||||
|
|
||||||
* ``dom`` - builds a tree based on ``xml.dom.minidom``.
|
* ``dom`` - builds a tree based on :mod:`xml.dom.minidom`.
|
||||||
|
|
||||||
* ``lxml.etree`` - uses lxml's implementation of the ``ElementTree``
|
* ``lxml`` - uses the :mod:`lxml.etree` implementation of the ``ElementTree``
|
||||||
API. The performance gains are relatively small compared to using the
|
API. The performance gains are relatively small compared to using the
|
||||||
accelerated ``ElementTree`` module.
|
accelerated ``ElementTree`` module.
|
||||||
|
|
||||||
|
@ -31,21 +34,15 @@ You can specify the builder by name when using the shorthand API:
|
||||||
with open("mydocument.html", "rb") as f:
|
with open("mydocument.html", "rb") as f:
|
||||||
lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
|
lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
|
||||||
|
|
||||||
When instantiating a parser object, you have to pass a tree builder
|
To get a builder class by name, use the :func:`~html5lib.treebuilders.getTreeBuilder` function.
|
||||||
class in the ``tree`` keyword attribute:
|
|
||||||
|
When instantiating a :class:`~html5lib.html5parser.HTMLParser` object, you must pass a tree builder class via the ``tree`` keyword attribute:
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
import html5lib
|
import html5lib
|
||||||
parser = html5lib.HTMLParser(tree=SomeTreeBuilder)
|
TreeBuilder = html5lib.getTreeBuilder("dom")
|
||||||
document = parser.parse("<p>Hello World!")
|
parser = html5lib.HTMLParser(tree=TreeBuilder)
|
||||||
|
|
||||||
To get a builder class by name, use the ``getTreeBuilder`` function:
|
|
||||||
|
|
||||||
.. code-block:: python
|
|
||||||
|
|
||||||
import html5lib
|
|
||||||
parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
|
|
||||||
minidom_document = parser.parse("<p>Hello World!")
|
minidom_document = parser.parse("<p>Hello World!")
|
||||||
|
|
||||||
The implementation of builders can be found in `html5lib/treebuilders/
|
The implementation of builders can be found in `html5lib/treebuilders/
|
||||||
|
@ -55,17 +52,13 @@ The implementation of builders can be found in `html5lib/treebuilders/
|
||||||
Tree walkers
|
Tree walkers
|
||||||
------------
|
------------
|
||||||
|
|
||||||
Once a tree is ready, you can work on it either manually, or using
|
In addition to manipulating a tree directly, you can use a tree walker to generate a streaming view of it.
|
||||||
a tree walker, which provides a streaming view of the tree. html5lib
|
html5lib provides walkers for ``etree``, ``dom``, and ``lxml`` trees, as well as ``genshi`` `markup streams <https://genshi.edgewall.org/wiki/Documentation/streams.html>`_.
|
||||||
provides walkers for all three supported types of trees (``etree``,
|
|
||||||
``dom`` and ``lxml``).
|
|
||||||
|
|
||||||
The implementation of walkers can be found in `html5lib/treewalkers/
|
The implementation of walkers can be found in `html5lib/treewalkers/
|
||||||
<https://github.com/html5lib/html5lib-python/tree/master/html5lib/treewalkers>`_.
|
<https://github.com/html5lib/html5lib-python/tree/master/html5lib/treewalkers>`_.
|
||||||
|
|
||||||
Walkers make consuming HTML easier. html5lib uses them to provide you
|
html5lib provides :class:`~html5lib.serializer.HTMLSerializer` for generating a stream of bytes from a token stream, and several filters which manipulate the stream.
|
||||||
with has a couple of handy tools.
|
|
||||||
|
|
||||||
|
|
||||||
HTMLSerializer
|
HTMLSerializer
|
||||||
~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~
|
||||||
|
@ -90,15 +83,14 @@ The serializer lets you write HTML back as a stream of bytes.
|
||||||
'>'
|
'>'
|
||||||
'Witam wszystkich'
|
'Witam wszystkich'
|
||||||
|
|
||||||
You can customize the serializer behaviour in a variety of ways, consult
|
You can customize the serializer behaviour in a variety of ways. Consult
|
||||||
the :class:`~html5lib.serializer.htmlserializer.HTMLSerializer`
|
the :class:`~html5lib.serializer.HTMLSerializer` documentation.
|
||||||
documentation.
|
|
||||||
|
|
||||||
|
|
 Filters
 ~~~~~~~

-You can alter the stream content with filters provided by html5lib:
+html5lib provides several filters:

 * :class:`alphabeticalattributes.Filter
   <html5lib.filters.alphabeticalattributes.Filter>` sorts attributes on
@@ -110,11 +102,11 @@ You can alter the stream content with filters provided by html5lib:
   the document

 * :class:`lint.Filter <html5lib.filters.lint.Filter>` raises
-  ``LintError`` exceptions on invalid tag and attribute names, invalid
+  :exc:`AssertionError` exceptions on invalid tag and attribute names, invalid
   PCDATA, etc.

 * :class:`optionaltags.Filter <html5lib.filters.optionaltags.Filter>`
-  removes tags from the stream which are not necessary to produce valid
+  removes tags from the token stream which are not necessary to produce valid
   HTML

 * :class:`sanitizer.Filter <html5lib.filters.sanitizer.Filter>` removes
@@ -125,9 +117,9 @@ You can alter the stream content with filters provided by html5lib:

 * :class:`whitespace.Filter <html5lib.filters.whitespace.Filter>`
   collapses all whitespace characters to single spaces unless they're in
-  ``<pre/>`` or ``textarea`` tags.
+  ``<pre/>`` or ``<textarea/>`` tags.

-To use a filter, simply wrap it around a stream:
+To use a filter, simply wrap it around a token stream:

 .. code-block:: python

@@ -136,15 +128,17 @@ To use a filter, simply wrap it around a stream:
     >>> dom = html5lib.parse("<p><script>alert('Boo!')", treebuilder="dom")
     >>> walker = html5lib.getTreeWalker("dom")
     >>> stream = walker(dom)
-    >>> sane_stream = sanitizer.Filter(stream)
+    >>> clean_stream = sanitizer.Filter(stream)


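The sanitized token stream can be fed straight back into the serializer; a minimal sketch combining the two (module paths as vendored here):

.. code-block:: python

    import html5lib
    from html5lib.filters import sanitizer
    from html5lib.serializer import HTMLSerializer

    dom = html5lib.parse("<p><script>alert('Boo!')", treebuilder="dom")
    walker = html5lib.getTreeWalker("dom")

    # wrap the raw token stream in the sanitizer filter, then serialize it
    clean_stream = sanitizer.Filter(walker(dom))
    print("".join(HTMLSerializer().serialize(clean_stream)))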
 Tree adapters
 -------------

-Used to translate one type of tree to another. More documentation
-pending, sorry.
+Tree adapters can be used to translate between tree formats.
+Two adapters are provided by html5lib:

+* :func:`html5lib.treeadapters.genshi.to_genshi()` generates a `Genshi markup stream <https://genshi.edgewall.org/wiki/Documentation/streams.html>`_.
+* :func:`html5lib.treeadapters.sax.to_sax()` calls a SAX handler based on the tree.

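A hedged sketch of the SAX adapter named above; the ContentHandler subclass is our own stand-in, only ``to_sax`` itself comes from html5lib:

.. code-block:: python

    import xml.sax.handler

    import html5lib
    from html5lib.treeadapters import sax

    class NamePrinter(xml.sax.handler.ContentHandler):
        # minimal handler: report element names as they stream past
        def startElementNS(self, name, qname, attrs):
            print("start", name)

    tree = html5lib.parse("<p>hello")
    walker = html5lib.getTreeWalker("etree")
    sax.to_sax(walker(tree), NamePrinter())  # replays the tree as SAX events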
 Encoding discovery
 ------------------
@@ -156,54 +150,16 @@ the following way:
 * The encoding may be explicitly specified by passing the name of the
   encoding as the encoding parameter to the
   :meth:`~html5lib.html5parser.HTMLParser.parse` method on
-  ``HTMLParser`` objects.
+  :class:`~html5lib.html5parser.HTMLParser` objects.

 * If no encoding is specified, the parser will attempt to detect the
   encoding from a ``<meta>`` element in the first 512 bytes of the
   document (this is only a partial implementation of the current HTML
-  5 specification).
+  specification).

-* If no encoding can be found and the chardet library is available, an
+* If no encoding can be found and the :mod:`chardet` library is available, an
   attempt will be made to sniff the encoding from the byte pattern.

 * If all else fails, the default encoding will be used. This is usually
   `Windows-1252 <http://en.wikipedia.org/wiki/Windows-1252>`_, which is
   a common fallback used by Web browsers.

-
-Tokenizers
-----------
-
-The part of the parser responsible for translating a raw input stream
-into meaningful tokens is the tokenizer. Currently html5lib provides
-two.
-
-To set up a tokenizer, simply pass it when instantiating
-a :class:`~html5lib.html5parser.HTMLParser`:
-
-.. code-block:: python
-
-    import html5lib
-    from html5lib import sanitizer
-
-    p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
-    p.parse("<p>Surprise!<script>alert('Boo!');</script>")
-
-HTMLTokenizer
-~~~~~~~~~~~~~
-
-This is the default tokenizer, the heart of html5lib. The implementation
-can be found in `html5lib/tokenizer.py
-<https://github.com/html5lib/html5lib-python/blob/master/html5lib/tokenizer.py>`_.
-
-HTMLSanitizer
-~~~~~~~~~~~~~
-
-This is a tokenizer that removes unsafe markup and CSS styles from the
-input. Elements that are known to be safe are passed through and the
-rest is converted to visible text. The default configuration of the
-sanitizer follows the `WHATWG Sanitization Rules
-<http://wiki.whatwg.org/wiki/Sanitization_rules>`_.
-
-The implementation can be found in `html5lib/sanitizer.py
-<https://github.com/html5lib/html5lib-python/blob/master/html5lib/sanitizer.py>`_.
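Since the encoding plumbing changed in this release (the ``_inputstream.py`` hunks below replace the old ``encoding``/``parseMeta``/``chardet`` arguments), here is a sketch of the new-style call; ``transport_encoding`` is the 1.0 name for an encoding supplied by the transport layer, such as an HTTP header:

.. code-block:: python

    import html5lib

    with open("my_document.html", "rb") as f:
        # bytes in; the transport-layer hint takes precedence over <meta>
        tree = html5lib.parse(f, transport_encoding="utf-8")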
@@ -0,0 +1,9 @@
+#!/bin/bash -e
+
+if [[ ! -x $(which flake8) ]]; then
+    echo "fatal: flake8 not found on $PATH. Exiting."
+    exit 1
+fi
+
+flake8 `dirname $0`
+exit $?
testing/web-platform/tests/tools/third_party/html5lib/html5lib/__init__.py (vendored, new file, 35 lines)
@@ -0,0 +1,35 @@
+"""
+HTML parsing library based on the `WHATWG HTML specification
+<https://whatwg.org/html>`_. The parser is designed to be compatible with
+existing HTML found in the wild and implements well-defined error recovery that
+is largely compatible with modern desktop web browsers.
+
+Example usage::
+
+    import html5lib
+    with open("my_document.html", "rb") as f:
+        tree = html5lib.parse(f)
+
+For convenience, this module re-exports the following names:
+
+* :func:`~.html5parser.parse`
+* :func:`~.html5parser.parseFragment`
+* :class:`~.html5parser.HTMLParser`
+* :func:`~.treebuilders.getTreeBuilder`
+* :func:`~.treewalkers.getTreeWalker`
+* :func:`~.serializer.serialize`
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+
+from .html5parser import HTMLParser, parse, parseFragment
+from .treebuilders import getTreeBuilder
+from .treewalkers import getTreeWalker
+from .serializer import serialize
+
+__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
+           "getTreeWalker", "serialize"]
+
+# this has to be at the top level, see how setup.py parses this
+#: Distribution version number.
+__version__ = "1.0.1"
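A quick sketch of two of the re-exported entry points:

.. code-block:: python

    import html5lib

    doc = html5lib.parse("<p>Hello")                # whole document
    frag = html5lib.parseFragment("<b>bold</b> x")  # fragment, no html/body wrapper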
@@ -175,18 +175,18 @@ def escapeRegexp(string):
     return string

 # output from the above
-nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa

-nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa

 # Simpler things
-nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
+nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")


 class InfosetFilter(object):
     replacementRegexp = re.compile(r"U[\dA-F]{5,5}")

-    def __init__(self, replaceChars=None,
+    def __init__(self,
                  dropXmlnsLocalName=False,
                  dropXmlnsAttrNs=False,
                  preventDoubleDashComments=False,
@@ -217,7 +217,7 @@ class InfosetFilter(object):
         else:
             return self.toXmlName(name)

-    def coerceElement(self, name, namespace=None):
+    def coerceElement(self, name):
         return self.toXmlName(name)

     def coerceComment(self, data):
@@ -225,11 +225,14 @@ class InfosetFilter(object):
             while "--" in data:
                 warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
                 data = data.replace("--", "- -")
+            if data.endswith("-"):
+                warnings.warn("Comments cannot end in a dash", DataLossWarning)
+                data += " "
         return data

     def coerceCharacters(self, data):
         if self.replaceFormFeedCharacters:
-            for i in range(data.count("\x0C")):
+            for _ in range(data.count("\x0C")):
                 warnings.warn("Text cannot contain U+000C", DataLossWarning)
             data = data.replace("\x0C", " ")
         # Other non-xml characters
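A hedged illustration of the comment-coercion rule added above (``InfosetFilter`` lives in the private ``html5lib._ihatexml`` module in this release; the outputs shown are what the code implies, not verified):

.. code-block:: python

    import warnings

    from html5lib._ihatexml import InfosetFilter

    f = InfosetFilter(preventDoubleDashComments=True)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # each coercion emits a DataLossWarning
        print(f.coerceComment("a--b"))   # 'a- -b'  (adjacent dashes split)
        print(f.coerceComment("ends-"))  # 'ends- ' (trailing dash padded, new in this patch)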
@@ -1,13 +1,16 @@
 from __future__ import absolute_import, division, unicode_literals

-from six import text_type
-from six.moves import http_client
+from six import text_type, binary_type
+from six.moves import http_client, urllib

 import codecs
 import re

+import webencodings
+
 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
-from .constants import encodings, ReparseException
-from . import utils
+from .constants import _ReparseException
+from . import _utils

 from io import StringIO

@@ -16,19 +19,26 @@ try:
 except ImportError:
     BytesIO = StringIO

-try:
-    from io import BufferedIOBase
-except ImportError:
-    class BufferedIOBase(object):
-        pass
-
 # Non-unicode versions of constants for use in the pre-parser
 spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
 asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
 asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])

-invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"  # noqa

+if _utils.supports_lone_surrogates:
+    # Use one extra step of indirection and create surrogates with
+    # eval. Not using this indirection would introduce an illegal
+    # unicode literal on platforms not supporting such lone
+    # surrogates.
+    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
+                                    eval('"\\uD800-\\uDFFF"') +  # pylint:disable=eval-used
+                                    "]")
+else:
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                   0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
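The ``eval()`` indirection above exists because a lone-surrogate literal is rejected at compile time on platforms (such as Jython) that cannot represent it, so the pattern is assembled at runtime instead. A minimal sketch of the same trick:

.. code-block:: python

    import re

    # assembled at runtime so the source never contains a bare lone surrogate
    surrogate_range = eval('"\\uD800-\\uDFFF"')
    lone_surrogate_re = re.compile("[" + surrogate_range + "]")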
@@ -38,7 +48,7 @@ non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                   0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                   0x10FFFE, 0x10FFFF])

-ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
+ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")

 # Cache for charsUntil()
 charsUntilRegEx = {}
@@ -118,10 +128,13 @@ class BufferedStream(object):
         return b"".join(rv)


-def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
-    if isinstance(source, http_client.HTTPResponse):
-        # Work around Python bug #20007: read(0) closes the connection.
-        # http://bugs.python.org/issue20007
+def HTMLInputStream(source, **kwargs):
+    # Work around Python bug #20007: read(0) closes the connection.
+    # http://bugs.python.org/issue20007
+    if (isinstance(source, http_client.HTTPResponse) or
+        # Also check for addinfourl wrapping HTTPResponse
+        (isinstance(source, urllib.response.addbase) and
+         isinstance(source.fp, http_client.HTTPResponse))):
         isUnicode = False
     elif hasattr(source, "read"):
         isUnicode = isinstance(source.read(0), text_type)
|
||||||
isUnicode = isinstance(source, text_type)
|
isUnicode = isinstance(source, text_type)
|
||||||
|
|
||||||
if isUnicode:
|
if isUnicode:
|
||||||
if encoding is not None:
|
encodings = [x for x in kwargs if x.endswith("_encoding")]
|
||||||
raise TypeError("Cannot explicitly set an encoding with a unicode string")
|
if encodings:
|
||||||
|
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
|
||||||
|
|
||||||
return HTMLUnicodeInputStream(source)
|
return HTMLUnicodeInputStream(source, **kwargs)
|
||||||
else:
|
else:
|
||||||
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
|
return HTMLBinaryInputStream(source, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class HTMLUnicodeInputStream(object):
|
class HTMLUnicodeInputStream(object):
|
||||||
|
@@ -160,22 +174,21 @@ class HTMLUnicodeInputStream(object):
         regardless of any BOM or later declaration (such as in a meta
         element)

-        parseMeta - Look for a <meta> element containing encoding information
-
         """

-        # Craziness
-        if len("\U0010FFFF") == 1:
+        if not _utils.supports_lone_surrogates:
+            # Such platforms will have already checked for such
+            # surrogate errors, so no need to do this checking.
+            self.reportCharacterErrors = None
+        elif len("\U0010FFFF") == 1:
             self.reportCharacterErrors = self.characterErrorsUCS4
-            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
         else:
             self.reportCharacterErrors = self.characterErrorsUCS2
-            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")

         # List of where new lines occur
         self.newLines = [0]

-        self.charEncoding = ("utf-8", "certain")
+        self.charEncoding = (lookupEncoding("utf-8"), "certain")
         self.dataStream = self.openStream(source)

         self.reset()
@@ -265,12 +278,10 @@ class HTMLUnicodeInputStream(object):
             self._bufferedCharacter = data[-1]
             data = data[:-1]

-        self.reportCharacterErrors(data)
+        if self.reportCharacterErrors:
+            self.reportCharacterErrors(data)

         # Replace invalid characters
-        # Note U+0000 is dealt with in the tokenizer
-        data = self.replaceCharactersRegexp.sub("\ufffd", data)
-
         data = data.replace("\r\n", "\n")
         data = data.replace("\r", "\n")

@@ -280,7 +291,7 @@ class HTMLUnicodeInputStream(object):
         return True

     def characterErrorsUCS4(self, data):
-        for i in range(len(invalid_unicode_re.findall(data))):
+        for _ in range(len(invalid_unicode_re.findall(data))):
             self.errors.append("invalid-codepoint")

     def characterErrorsUCS2(self, data):
@@ -293,9 +304,9 @@ class HTMLUnicodeInputStream(object):
             codepoint = ord(match.group())
             pos = match.start()
             # Pretty sure there should be endianness issues here
-            if utils.isSurrogatePair(data[pos:pos + 2]):
+            if _utils.isSurrogatePair(data[pos:pos + 2]):
                 # We have a surrogate pair!
-                char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
+                char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
                 if char_val in non_bmp_invalid_codepoints:
                     self.errors.append("invalid-codepoint")
                     skip = True
@@ -378,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):

     """

-    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
+    def __init__(self, source, override_encoding=None, transport_encoding=None,
+                 same_origin_parent_encoding=None, likely_encoding=None,
+                 default_encoding="windows-1252", useChardet=True):
         """Initialises the HTMLInputStream.

         HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -391,8 +404,6 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
         regardless of any BOM or later declaration (such as in a meta
         element)

-        parseMeta - Look for a <meta> element containing encoding information
-
         """
         # Raw Stream - for unicode objects this will encode to utf-8 and set
         # self.charEncoding as appropriate
@@ -400,27 +411,28 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):

         HTMLUnicodeInputStream.__init__(self, self.rawStream)

-        self.charEncoding = (codecName(encoding), "certain")
-
         # Encoding Information
         # Number of bytes to use when looking for a meta element with
         # encoding information
-        self.numBytesMeta = 512
+        self.numBytesMeta = 1024
         # Number of bytes to use when using detecting encoding using chardet
         self.numBytesChardet = 100
-        # Encoding to use if no other information can be found
-        self.defaultEncoding = "windows-1252"
+        # Things from args
+        self.override_encoding = override_encoding
+        self.transport_encoding = transport_encoding
+        self.same_origin_parent_encoding = same_origin_parent_encoding
+        self.likely_encoding = likely_encoding
+        self.default_encoding = default_encoding

-        # Detect encoding iff no explicit "transport level" encoding is supplied
-        if (self.charEncoding[0] is None):
-            self.charEncoding = self.detectEncoding(parseMeta, chardet)
+        # Determine encoding
+        self.charEncoding = self.determineEncoding(useChardet)
+        assert self.charEncoding[0] is not None

         # Call superclass
         self.reset()

     def reset(self):
-        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
-                                                                 'replace')
+        self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
         HTMLUnicodeInputStream.reset(self)

     def openStream(self, source):
@@ -437,29 +449,50 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):

         try:
             stream.seek(stream.tell())
-        except:
+        except:  # pylint:disable=bare-except
             stream = BufferedStream(stream)

         return stream

-    def detectEncoding(self, parseMeta=True, chardet=True):
-        # First look for a BOM
+    def determineEncoding(self, chardet=True):
+        # BOMs take precedence over everything
         # This will also read past the BOM if present
-        encoding = self.detectBOM()
-        confidence = "certain"
-        # If there is no BOM need to look for meta elements with encoding
-        # information
-        if encoding is None and parseMeta:
-            encoding = self.detectEncodingMeta()
-            confidence = "tentative"
-        # Guess with chardet, if avaliable
-        if encoding is None and chardet:
-            confidence = "tentative"
+        charEncoding = self.detectBOM(), "certain"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # If we've been overriden, we've been overriden
+        charEncoding = lookupEncoding(self.override_encoding), "certain"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Now check the transport layer
+        charEncoding = lookupEncoding(self.transport_encoding), "certain"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Look for meta elements with encoding information
+        charEncoding = self.detectEncodingMeta(), "tentative"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Parent document encoding
+        charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
+        if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
+            return charEncoding
+
+        # "likely" encoding
+        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Guess with chardet, if available
+        if chardet:
             try:
-                try:
-                    from charade.universaldetector import UniversalDetector
-                except ImportError:
-                    from chardet.universaldetector import UniversalDetector
+                from chardet.universaldetector import UniversalDetector
+            except ImportError:
+                pass
+            else:
                 buffers = []
                 detector = UniversalDetector()
                 while not detector.done:
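``determineEncoding()`` now walks a fixed precedence list: BOM, override, transport, ``<meta>``, same-origin parent, likely encoding, chardet, default. A hedged sketch of what that means at the call site (module path as vendored in this commit; the printed result is what the code implies, not verified):

.. code-block:: python

    from html5lib._inputstream import HTMLInputStream

    # a BOM beats the transport-layer hint, which would itself beat <meta>
    stream = HTMLInputStream(b"\xef\xbb\xbf<meta charset='windows-1251'>",
                             transport_encoding="latin-1")
    print(stream.charEncoding)  # expected: the utf-8 encoding, 'certain'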
@@ -470,37 +503,34 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
                     buffers.append(buffer)
                     detector.feed(buffer)
                 detector.close()
-                encoding = detector.result['encoding']
+                encoding = lookupEncoding(detector.result['encoding'])
                 self.rawStream.seek(0)
-            except ImportError:
-                pass
-        # If all else fails use the default encoding
-        if encoding is None:
-            confidence = "tentative"
-            encoding = self.defaultEncoding
+                if encoding is not None:
+                    return encoding, "tentative"

-        # Substitute for equivalent encodings:
-        encodingSub = {"iso-8859-1": "windows-1252"}
+        # Try the default encoding
+        charEncoding = lookupEncoding(self.default_encoding), "tentative"
+        if charEncoding[0] is not None:
+            return charEncoding

-        if encoding.lower() in encodingSub:
-            encoding = encodingSub[encoding.lower()]
-
-        return encoding, confidence
+        # Fallback to html5lib's default if even that hasn't worked
+        return lookupEncoding("windows-1252"), "tentative"

     def changeEncoding(self, newEncoding):
         assert self.charEncoding[1] != "certain"
-        newEncoding = codecName(newEncoding)
-        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
-            newEncoding = "utf-8"
+        newEncoding = lookupEncoding(newEncoding)
         if newEncoding is None:
             return
+        if newEncoding.name in ("utf-16be", "utf-16le"):
+            newEncoding = lookupEncoding("utf-8")
+            assert newEncoding is not None
         elif newEncoding == self.charEncoding[0]:
             self.charEncoding = (self.charEncoding[0], "certain")
         else:
             self.rawStream.seek(0)
-            self.reset()
             self.charEncoding = (newEncoding, "certain")
-            raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
+            self.reset()
+            raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))

     def detectBOM(self):
         """Attempts to detect at BOM at the start of the stream. If
@@ -508,8 +538,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
-           codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
-           codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
+           codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
+           codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
        }

        # Go to beginning of file and read in 4 bytes
|
||||||
|
|
||||||
# Set the read position past the BOM if one was found, otherwise
|
# Set the read position past the BOM if one was found, otherwise
|
||||||
# set it to the start of the stream
|
# set it to the start of the stream
|
||||||
self.rawStream.seek(encoding and seek or 0)
|
if encoding:
|
||||||
|
self.rawStream.seek(seek)
|
||||||
return encoding
|
return lookupEncoding(encoding)
|
||||||
|
else:
|
||||||
|
self.rawStream.seek(0)
|
||||||
|
return None
|
||||||
|
|
||||||
def detectEncodingMeta(self):
|
def detectEncodingMeta(self):
|
||||||
"""Report the encoding declared by the meta element
|
"""Report the encoding declared by the meta element
|
||||||
|
@@ -542,8 +575,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
         self.rawStream.seek(0)
         encoding = parser.getEncoding()

-        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
-            encoding = "utf-8"
+        if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
+            encoding = lookupEncoding("utf-8")

         return encoding

@@ -557,6 +590,7 @@ class EncodingBytes(bytes):
         return bytes.__new__(self, value.lower())

     def __init__(self, value):
+        # pylint:disable=unused-argument
         self._position = -1

     def __iter__(self):
@@ -667,7 +701,7 @@ class EncodingParser(object):
                           (b"<!", self.handleOther),
                           (b"<?", self.handleOther),
                           (b"<", self.handlePossibleStartTag))
-        for byte in self.data:
+        for _ in self.data:
             keepParsing = True
             for key, method in methodDispatch:
                 if self.data.matchBytes(key):
@@ -706,7 +740,7 @@ class EncodingParser(object):
                 return False
             elif attr[0] == b"charset":
                 tentativeEncoding = attr[1]
-                codec = codecName(tentativeEncoding)
+                codec = lookupEncoding(tentativeEncoding)
                 if codec is not None:
                     self.encoding = codec
                     return False
@@ -714,7 +748,7 @@ class EncodingParser(object):
                 contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                 tentativeEncoding = contentParser.parse()
                 if tentativeEncoding is not None:
-                    codec = codecName(tentativeEncoding)
+                    codec = lookupEncoding(tentativeEncoding)
                     if codec is not None:
                         if hasPragma:
                             self.encoding = codec
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def codecName(encoding):
|
def lookupEncoding(encoding):
|
||||||
"""Return the python codec name corresponding to an encoding or None if the
|
"""Return the python codec name corresponding to an encoding or None if the
|
||||||
string doesn't correspond to a valid encoding."""
|
string doesn't correspond to a valid encoding."""
|
||||||
if isinstance(encoding, bytes):
|
if isinstance(encoding, binary_type):
|
||||||
try:
|
try:
|
||||||
encoding = encoding.decode("ascii")
|
encoding = encoding.decode("ascii")
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
return None
|
return None
|
||||||
if encoding:
|
|
||||||
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
|
if encoding is not None:
|
||||||
return encodings.get(canonicalName, None)
|
try:
|
||||||
|
return webencodings.lookup(encoding)
|
||||||
|
except AttributeError:
|
||||||
|
return None
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
|
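``lookupEncoding()`` now defers label resolution to the vendored webencodings package added by this commit. A quick sketch of that lookup:

.. code-block:: python

    import webencodings

    # WHATWG labels normalize to canonical encoding names
    print(webencodings.lookup("utf8").name)    # 'utf-8'
    print(webencodings.lookup("latin1").name)  # 'windows-1252', per the Encoding spec
    print(webencodings.lookup("no-such"))      # None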
@@ -1,9 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals

-try:
-    chr = unichr  # flake8: noqa
-except NameError:
-    pass
+from six import unichr as chr

 from collections import deque

@@ -14,9 +11,9 @@ from .constants import digits, hexDigits, EOF
 from .constants import tokenTypes, tagTokenTypes
 from .constants import replacementCharacters

-from .inputstream import HTMLInputStream
+from ._inputstream import HTMLInputStream

-from .trie import Trie
+from ._trie import Trie

 entitiesTrie = Trie(entities)

@@ -34,16 +31,11 @@ class HTMLTokenizer(object):
         Points to HTMLInputStream object.
     """

-    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
-                 lowercaseElementName=True, lowercaseAttrName=True, parser=None):
-
-        self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
+    def __init__(self, stream, parser=None, **kwargs):
+        self.stream = HTMLInputStream(stream, **kwargs)
         self.parser = parser

-        # Perform case conversions?
-        self.lowercaseElementName = lowercaseElementName
-        self.lowercaseAttrName = lowercaseAttrName
-
         # Setup the initial tokenizer state
         self.escapeFlag = False
         self.lastFourChars = []
@@ -147,8 +139,8 @@ class HTMLTokenizer(object):
             output = "&"

             charStack = [self.stream.char()]
-            if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
-                    or (allowedChar is not None and allowedChar == charStack[0])):
+            if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
+                    (allowedChar is not None and allowedChar == charStack[0])):
                 self.stream.unget(charStack[0])

             elif charStack[0] == "#":
@@ -235,8 +227,7 @@ class HTMLTokenizer(object):
             token = self.currentToken
             # Add token to the queue to be yielded
             if (token["type"] in tagTokenTypes):
-                if self.lowercaseElementName:
-                    token["name"] = token["name"].translate(asciiUpper2Lower)
+                token["name"] = token["name"].translate(asciiUpper2Lower)
                 if token["type"] == tokenTypes["EndTag"]:
                     if token["data"]:
                         self.tokenQueue.append({"type": tokenTypes["ParseError"],
@@ -921,10 +912,9 @@ class HTMLTokenizer(object):
         # Attributes are not dropped at this stage. That happens when the
         # start tag token is emitted so values can still be safely appended
         # to attributes, but we do want to report the parse error in time.
-        if self.lowercaseAttrName:
-            self.currentToken["data"][-1][0] = (
-                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
-        for name, value in self.currentToken["data"][:-1]:
+        self.currentToken["data"][-1][0] = (
+            self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
+        for name, _ in self.currentToken["data"][:-1]:
             if self.currentToken["data"][-1][0] == name:
                 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                         "duplicate-attribute"})
|
||||||
else:
|
else:
|
||||||
data.append(char)
|
data.append(char)
|
||||||
|
|
||||||
data = "".join(data)
|
data = "".join(data) # pylint:disable=redefined-variable-type
|
||||||
# Deal with null here rather than in the parser
|
# Deal with null here rather than in the parser
|
||||||
nullCount = data.count("\u0000")
|
nullCount = data.count("\u0000")
|
||||||
if nullCount > 0:
|
if nullCount > 0:
|
||||||
for i in range(nullCount):
|
for _ in range(nullCount):
|
||||||
self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
||||||
"data": "invalid-codepoint"})
|
"data": "invalid-codepoint"})
|
||||||
data = data.replace("\u0000", "\uFFFD")
|
data = data.replace("\u0000", "\uFFFD")
|
|
@@ -4,9 +4,11 @@ from .py import Trie as PyTrie

 Trie = PyTrie

+# pylint:disable=wrong-import-position
 try:
     from .datrie import Trie as DATrie
 except ImportError:
     pass
 else:
     Trie = DATrie
+# pylint:enable=wrong-import-position
@@ -7,13 +7,13 @@ class Trie(Mapping):
     """Abstract base class for tries"""

     def keys(self, prefix=None):
-        keys = super().keys()
+        # pylint:disable=arguments-differ
+        keys = super(Trie, self).keys()

         if prefix is None:
             return set(keys)

-        # Python 2.6: no set comprehensions
-        return set([x for x in keys if x.startswith(prefix)])
+        return {x for x in keys if x.startswith(prefix)}

     def has_keys_with_prefix(self, prefix):
         for key in self.keys():
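A small sketch of the trie interface these hunks touch (the pure-Python ``Trie`` wraps a mapping; ``keys()`` with a prefix filters, which is exactly what the new set comprehension does):

.. code-block:: python

    from html5lib._trie import Trie

    t = Trie({"amp": "&", "and": "^", "lt": "<"})
    print(t.keys("a"))                  # {'amp', 'and'}
    print(t.has_keys_with_prefix("l"))  # True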
@@ -2,6 +2,8 @@ from __future__ import absolute_import, division, unicode_literals

 from types import ModuleType

+from six import text_type
+
 try:
     import xml.etree.cElementTree as default_etree
 except ImportError:
@@ -9,7 +11,26 @@ except ImportError:


 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
-           "surrogatePairToCodepoint", "moduleFactoryFactory"]
+           "surrogatePairToCodepoint", "moduleFactoryFactory",
+           "supports_lone_surrogates"]
+
+
+# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
+# caught by the below test. In general this would be any platform
+# using UTF-16 as its encoding of unicode strings, such as
+# Jython. This is because UTF-16 itself is based on the use of such
+# surrogates, and there is no mechanism to further escape such
+# escapes.
+try:
+    _x = eval('"\\uD800"')  # pylint:disable=eval-used
+    if not isinstance(_x, text_type):
+        # We need this with u"" because of http://bugs.jython.org/issue2039
+        _x = eval('u"\\uD800"')  # pylint:disable=eval-used
+        assert isinstance(_x, text_type)
+except:  # pylint:disable=bare-except
+    supports_lone_surrogates = False
+else:
+    supports_lone_surrogates = True


 class MethodDispatcher(dict):
@@ -31,19 +52,20 @@ class MethodDispatcher(dict):
         # anything here.
         _dictEntries = []
         for name, value in items:
-            if type(name) in (list, tuple, frozenset, set):
+            if isinstance(name, (list, tuple, frozenset, set)):
                 for item in name:
                     _dictEntries.append((item, value))
             else:
                 _dictEntries.append((name, value))
         dict.__init__(self, _dictEntries)
+        assert len(self) == len(_dictEntries)
         self.default = None

     def __getitem__(self, key):
         return dict.get(self, key, self.default)


-# Some utility functions to dal with weirdness around UCS2 vs UCS4
+# Some utility functions to deal with weirdness around UCS2 vs UCS4
 # python builds

 def isSurrogatePair(data):
@@ -70,13 +92,33 @@ def moduleFactoryFactory(factory):
         else:
             name = b"_%s_factory" % baseModule.__name__

-        if name in moduleCache:
-            return moduleCache[name]
-        else:
+        kwargs_tuple = tuple(kwargs.items())
+
+        try:
+            return moduleCache[name][args][kwargs_tuple]
+        except KeyError:
             mod = ModuleType(name)
             objs = factory(baseModule, *args, **kwargs)
             mod.__dict__.update(objs)
-            moduleCache[name] = mod
+            if "name" not in moduleCache:
+                moduleCache[name] = {}
+            if "args" not in moduleCache[name]:
+                moduleCache[name][args] = {}
+            if "kwargs" not in moduleCache[name][args]:
+                moduleCache[name][args][kwargs_tuple] = {}
+            moduleCache[name][args][kwargs_tuple] = mod
             return mod

     return moduleFactory


+def memoize(func):
+    cache = {}
+
+    def wrapped(*args, **kwargs):
+        key = (tuple(args), tuple(kwargs.items()))
+        if key not in cache:
+            cache[key] = func(*args, **kwargs)
+        return cache[key]
+
+    return wrapped
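The new ``memoize`` helper is a plain call cache keyed on positional and keyword arguments; note it never evicts, and unhashable arguments would raise ``TypeError``. A usage sketch:

.. code-block:: python

    from html5lib._utils import memoize

    @memoize
    def slow_square(n):
        print("computing", n)
        return n * n

    slow_square(4)  # prints 'computing 4', returns 16
    slow_square(4)  # cache hit: returns 16 silently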
@@ -1,292 +1,296 @@
 from __future__ import absolute_import, division, unicode_literals

 import string
-import gettext
-_ = gettext.gettext

 EOF = None

 E = {
     "null-character":
-        _("Null character in input stream, replaced with U+FFFD."),
+        "Null character in input stream, replaced with U+FFFD.",
     "invalid-codepoint":
-        _("Invalid codepoint in stream."),
+        "Invalid codepoint in stream.",
     "incorrectly-placed-solidus":
-        _("Solidus (/) incorrectly placed in tag."),
+        "Solidus (/) incorrectly placed in tag.",
     "incorrect-cr-newline-entity":
-        _("Incorrect CR newline entity, replaced with LF."),
+        "Incorrect CR newline entity, replaced with LF.",
     "illegal-windows-1252-entity":
-        _("Entity used with illegal number (windows-1252 reference)."),
+        "Entity used with illegal number (windows-1252 reference).",
     "cant-convert-numeric-entity":
-        _("Numeric entity couldn't be converted to character "
-          "(codepoint U+%(charAsInt)08x)."),
+        "Numeric entity couldn't be converted to character "
+        "(codepoint U+%(charAsInt)08x).",
     "illegal-codepoint-for-numeric-entity":
-        _("Numeric entity represents an illegal codepoint: "
-          "U+%(charAsInt)08x."),
+        "Numeric entity represents an illegal codepoint: "
+        "U+%(charAsInt)08x.",
     "numeric-entity-without-semicolon":
-        _("Numeric entity didn't end with ';'."),
+        "Numeric entity didn't end with ';'.",
     "expected-numeric-entity-but-got-eof":
-        _("Numeric entity expected. Got end of file instead."),
+        "Numeric entity expected. Got end of file instead.",
     "expected-numeric-entity":
-        _("Numeric entity expected but none found."),
+        "Numeric entity expected but none found.",
     "named-entity-without-semicolon":
-        _("Named entity didn't end with ';'."),
+        "Named entity didn't end with ';'.",
     "expected-named-entity":
-        _("Named entity expected. Got none."),
+        "Named entity expected. Got none.",
     "attributes-in-end-tag":
-        _("End tag contains unexpected attributes."),
+        "End tag contains unexpected attributes.",
     'self-closing-flag-on-end-tag':
-        _("End tag contains unexpected self-closing flag."),
+        "End tag contains unexpected self-closing flag.",
     "expected-tag-name-but-got-right-bracket":
-        _("Expected tag name. Got '>' instead."),
+        "Expected tag name. Got '>' instead.",
     "expected-tag-name-but-got-question-mark":
-        _("Expected tag name. Got '?' instead. (HTML doesn't "
-          "support processing instructions.)"),
+        "Expected tag name. Got '?' instead. (HTML doesn't "
+        "support processing instructions.)",
     "expected-tag-name":
-        _("Expected tag name. Got something else instead"),
+        "Expected tag name. Got something else instead",
     "expected-closing-tag-but-got-right-bracket":
-        _("Expected closing tag. Got '>' instead. Ignoring '</>'."),
+        "Expected closing tag. Got '>' instead. Ignoring '</>'.",
     "expected-closing-tag-but-got-eof":
-        _("Expected closing tag. Unexpected end of file."),
+        "Expected closing tag. Unexpected end of file.",
     "expected-closing-tag-but-got-char":
-        _("Expected closing tag. Unexpected character '%(data)s' found."),
+        "Expected closing tag. Unexpected character '%(data)s' found.",
     "eof-in-tag-name":
-        _("Unexpected end of file in the tag name."),
+        "Unexpected end of file in the tag name.",
     "expected-attribute-name-but-got-eof":
-        _("Unexpected end of file. Expected attribute name instead."),
+        "Unexpected end of file. Expected attribute name instead.",
     "eof-in-attribute-name":
-        _("Unexpected end of file in attribute name."),
+        "Unexpected end of file in attribute name.",
     "invalid-character-in-attribute-name":
-        _("Invalid character in attribute name"),
+        "Invalid character in attribute name",
     "duplicate-attribute":
-        _("Dropped duplicate attribute on tag."),
+        "Dropped duplicate attribute on tag.",
     "expected-end-of-tag-name-but-got-eof":
-        _("Unexpected end of file. Expected = or end of tag."),
+        "Unexpected end of file. Expected = or end of tag.",
     "expected-attribute-value-but-got-eof":
-        _("Unexpected end of file. Expected attribute value."),
+        "Unexpected end of file. Expected attribute value.",
     "expected-attribute-value-but-got-right-bracket":
-        _("Expected attribute value. Got '>' instead."),
+        "Expected attribute value. Got '>' instead.",
     'equals-in-unquoted-attribute-value':
-        _("Unexpected = in unquoted attribute"),
+        "Unexpected = in unquoted attribute",
     'unexpected-character-in-unquoted-attribute-value':
-        _("Unexpected character in unquoted attribute"),
+        "Unexpected character in unquoted attribute",
     "invalid-character-after-attribute-name":
-        _("Unexpected character after attribute name."),
+        "Unexpected character after attribute name.",
     "unexpected-character-after-attribute-value":
-        _("Unexpected character after attribute value."),
+        "Unexpected character after attribute value.",
     "eof-in-attribute-value-double-quote":
-        _("Unexpected end of file in attribute value (\")."),
+        "Unexpected end of file in attribute value (\").",
     "eof-in-attribute-value-single-quote":
-        _("Unexpected end of file in attribute value (')."),
+        "Unexpected end of file in attribute value (').",
     "eof-in-attribute-value-no-quotes":
-        _("Unexpected end of file in attribute value."),
+        "Unexpected end of file in attribute value.",
     "unexpected-EOF-after-solidus-in-tag":
-        _("Unexpected end of file in tag. Expected >"),
+        "Unexpected end of file in tag. Expected >",
     "unexpected-character-after-solidus-in-tag":
-        _("Unexpected character after / in tag. Expected >"),
+        "Unexpected character after / in tag. Expected >",
     "expected-dashes-or-doctype":
-        _("Expected '--' or 'DOCTYPE'. Not found."),
+        "Expected '--' or 'DOCTYPE'. Not found.",
     "unexpected-bang-after-double-dash-in-comment":
-        _("Unexpected ! after -- in comment"),
+        "Unexpected ! after -- in comment",
     "unexpected-space-after-double-dash-in-comment":
-        _("Unexpected space after -- in comment"),
+        "Unexpected space after -- in comment",
     "incorrect-comment":
-        _("Incorrect comment."),
+        "Incorrect comment.",
     "eof-in-comment":
-        _("Unexpected end of file in comment."),
+        "Unexpected end of file in comment.",
     "eof-in-comment-end-dash":
-        _("Unexpected end of file in comment (-)"),
+        "Unexpected end of file in comment (-)",
     "unexpected-dash-after-double-dash-in-comment":
-        _("Unexpected '-' after '--' found in comment."),
+        "Unexpected '-' after '--' found in comment.",
     "eof-in-comment-double-dash":
-        _("Unexpected end of file in comment (--)."),
+        "Unexpected end of file in comment (--).",
     "eof-in-comment-end-space-state":
-        _("Unexpected end of file in comment."),
+        "Unexpected end of file in comment.",
     "eof-in-comment-end-bang-state":
-        _("Unexpected end of file in comment."),
+        "Unexpected end of file in comment.",
     "unexpected-char-in-comment":
-        _("Unexpected character in comment found."),
+        "Unexpected character in comment found.",
     "need-space-after-doctype":
-        _("No space after literal string 'DOCTYPE'."),
+        "No space after literal string 'DOCTYPE'.",
     "expected-doctype-name-but-got-right-bracket":
-        _("Unexpected > character. Expected DOCTYPE name."),
+        "Unexpected > character. Expected DOCTYPE name.",
     "expected-doctype-name-but-got-eof":
-        _("Unexpected end of file. Expected DOCTYPE name."),
+        "Unexpected end of file. Expected DOCTYPE name.",
     "eof-in-doctype-name":
-        _("Unexpected end of file in DOCTYPE name."),
+        "Unexpected end of file in DOCTYPE name.",
     "eof-in-doctype":
-        _("Unexpected end of file in DOCTYPE."),
+        "Unexpected end of file in DOCTYPE.",
     "expected-space-or-right-bracket-in-doctype":
-        _("Expected space or '>'. Got '%(data)s'"),
+        "Expected space or '>'. Got '%(data)s'",
     "unexpected-end-of-doctype":
-        _("Unexpected end of DOCTYPE."),
+        "Unexpected end of DOCTYPE.",
     "unexpected-char-in-doctype":
-        _("Unexpected character in DOCTYPE."),
+        "Unexpected character in DOCTYPE.",
     "eof-in-innerhtml":
-        _("XXX innerHTML EOF"),
+        "XXX innerHTML EOF",
     "unexpected-doctype":
-        _("Unexpected DOCTYPE. Ignored."),
+        "Unexpected DOCTYPE. Ignored.",
     "non-html-root":
-        _("html needs to be the first start tag."),
+        "html needs to be the first start tag.",
     "expected-doctype-but-got-eof":
-        _("Unexpected End of file. Expected DOCTYPE."),
+        "Unexpected End of file. Expected DOCTYPE.",
     "unknown-doctype":
-        _("Erroneous DOCTYPE."),
+        "Erroneous DOCTYPE.",
     "expected-doctype-but-got-chars":
-        _("Unexpected non-space characters. Expected DOCTYPE."),
+        "Unexpected non-space characters. Expected DOCTYPE.",
     "expected-doctype-but-got-start-tag":
-        _("Unexpected start tag (%(name)s). Expected DOCTYPE."),
+        "Unexpected start tag (%(name)s). Expected DOCTYPE.",
     "expected-doctype-but-got-end-tag":
-        _("Unexpected end tag (%(name)s). Expected DOCTYPE."),
+        "Unexpected end tag (%(name)s). Expected DOCTYPE.",
     "end-tag-after-implied-root":
-        _("Unexpected end tag (%(name)s) after the (implied) root element."),
+        "Unexpected end tag (%(name)s) after the (implied) root element.",
     "expected-named-closing-tag-but-got-eof":
-        _("Unexpected end of file. Expected end tag (%(name)s)."),
+        "Unexpected end of file. Expected end tag (%(name)s).",
     "two-heads-are-not-better-than-one":
-        _("Unexpected start tag head in existing head. Ignored."),
+        "Unexpected start tag head in existing head. Ignored.",
     "unexpected-end-tag":
-        _("Unexpected end tag (%(name)s). Ignored."),
+        "Unexpected end tag (%(name)s). Ignored.",
     "unexpected-start-tag-out-of-my-head":
-        _("Unexpected start tag (%(name)s) that can be in head. Moved."),
+        "Unexpected start tag (%(name)s) that can be in head. Moved.",
     "unexpected-start-tag":
-        _("Unexpected start tag (%(name)s)."),
+        "Unexpected start tag (%(name)s).",
     "missing-end-tag":
-        _("Missing end tag (%(name)s)."),
+        "Missing end tag (%(name)s).",
     "missing-end-tags":
-        _("Missing end tags (%(name)s)."),
+        "Missing end tags (%(name)s).",
     "unexpected-start-tag-implies-end-tag":
-        _("Unexpected start tag (%(startName)s) "
-          "implies end tag (%(endName)s)."),
+        "Unexpected start tag (%(startName)s) "
+        "implies end tag (%(endName)s).",
     "unexpected-start-tag-treated-as":
-        _("Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
+        "Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
     "deprecated-tag":
-        _("Unexpected start tag %(name)s. Don't use it!"),
+        "Unexpected start tag %(name)s. Don't use it!",
     "unexpected-start-tag-ignored":
-        _("Unexpected start tag %(name)s. Ignored."),
+        "Unexpected start tag %(name)s. Ignored.",
     "expected-one-end-tag-but-got-another":
-        _("Unexpected end tag (%(gotName)s). "
-          "Missing end tag (%(expectedName)s)."),
+        "Unexpected end tag (%(gotName)s). "
+        "Missing end tag (%(expectedName)s).",
     "end-tag-too-early":
-        _("End tag (%(name)s) seen too early. Expected other end tag."),
+        "End tag (%(name)s) seen too early. Expected other end tag.",
     "end-tag-too-early-named":
-        _("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
+        "Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).",
     "end-tag-too-early-ignored":
-        _("End tag (%(name)s) seen too early. Ignored."),
+        "End tag (%(name)s) seen too early. Ignored.",
     "adoption-agency-1.1":
-        _("End tag (%(name)s) violates step 1, "
-          "paragraph 1 of the adoption agency algorithm."),
+        "End tag (%(name)s) violates step 1, "
+        "paragraph 1 of the adoption agency algorithm.",
     "adoption-agency-1.2":
-        _("End tag (%(name)s) violates step 1, "
-          "paragraph 2 of the adoption agency algorithm."),
+        "End tag (%(name)s) violates step 1, "
+        "paragraph 2 of the adoption agency algorithm.",
     "adoption-agency-1.3":
-        _("End tag (%(name)s) violates step 1, "
-          "paragraph 3 of the adoption agency algorithm."),
+        "End tag (%(name)s) violates step 1, "
+        "paragraph 3 of the adoption agency algorithm.",
     "adoption-agency-4.4":
-        _("End tag (%(name)s) violates step 4, "
-          "paragraph 4 of the adoption agency algorithm."),
+        "End tag (%(name)s) violates step 4, "
+        "paragraph 4 of the adoption agency algorithm.",
     "unexpected-end-tag-treated-as":
-        _("Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
+        "Unexpected end tag (%(originalName)s). Treated as %(newName)s.",
     "no-end-tag":
-        _("This element (%(name)s) has no end tag."),
+        "This element (%(name)s) has no end tag.",
     "unexpected-implied-end-tag-in-table":
-        _("Unexpected implied end tag (%(name)s) in the table phase."),
+        "Unexpected implied end tag (%(name)s) in the table phase.",
     "unexpected-implied-end-tag-in-table-body":
-        _("Unexpected implied end tag (%(name)s) in the table body phase."),
+        "Unexpected implied end tag (%(name)s) in the table body phase.",
     "unexpected-char-implies-table-voodoo":
-        _("Unexpected non-space characters in "
-          "table context caused voodoo mode."),
+        "Unexpected non-space characters in "
+        "table context caused voodoo mode.",
     "unexpected-hidden-input-in-table":
-        _("Unexpected input with type hidden in table context."),
+        "Unexpected input with type hidden in table context.",
     "unexpected-form-in-table":
-        _("Unexpected form in table context."),
+        "Unexpected form in table context.",
     "unexpected-start-tag-implies-table-voodoo":
-        _("Unexpected start tag (%(name)s) in "
-          "table context caused voodoo mode."),
+        "Unexpected start tag (%(name)s) in "
+        "table context caused voodoo mode.",
     "unexpected-end-tag-implies-table-voodoo":
-        _("Unexpected end tag (%(name)s) in "
-          "table context caused voodoo mode."),
+        "Unexpected end tag (%(name)s) in "
+        "table context caused voodoo mode.",
     "unexpected-cell-in-table-body":
-        _("Unexpected table cell start tag (%(name)s) "
-          "in the table body phase."),
+        "Unexpected table cell start tag (%(name)s) "
+        "in the table body phase.",
     "unexpected-cell-end-tag":
-        _("Got table cell end tag (%(name)s) "
-          "while required end tags are missing."),
+        "Got table cell end tag (%(name)s) "
+        "while required end tags are missing.",
     "unexpected-end-tag-in-table-body":
-        _("Unexpected end tag (%(name)s) in the table body phase. Ignored."),
+        "Unexpected end tag (%(name)s) in the table body phase. Ignored.",
     "unexpected-implied-end-tag-in-table-row":
-        _("Unexpected implied end tag (%(name)s) in the table row phase."),
+        "Unexpected implied end tag (%(name)s) in the table row phase.",
     "unexpected-end-tag-in-table-row":
-        _("Unexpected end tag (%(name)s) in the table row phase. Ignored."),
+        "Unexpected end tag (%(name)s) in the table row phase. Ignored.",
     "unexpected-select-in-select":
-        _("Unexpected select start tag in the select phase "
-          "treated as select end tag."),
+        "Unexpected select start tag in the select phase "
+        "treated as select end tag.",
     "unexpected-input-in-select":
-        _("Unexpected input start tag in the select phase."),
+        "Unexpected input start tag in the select phase.",
     "unexpected-start-tag-in-select":
-        _("Unexpected start tag token (%(name)s in the select phase. "
-          "Ignored."),
+        "Unexpected start tag token (%(name)s in the select phase. "
+        "Ignored.",
     "unexpected-end-tag-in-select":
-        _("Unexpected end tag (%(name)s) in the select phase. Ignored."),
+        "Unexpected end tag (%(name)s) in the select phase. Ignored.",
     "unexpected-table-element-start-tag-in-select-in-table":
-        _("Unexpected table element start tag (%(name)s) in the select in table phase."),
+        "Unexpected table element start tag (%(name)s) in the select in table phase.",
     "unexpected-table-element-end-tag-in-select-in-table":
-        _("Unexpected table element end tag (%(name)s) in the select in table phase."),
+        "Unexpected table element end tag (%(name)s) in the select in table phase.",
     "unexpected-char-after-body":
-        _("Unexpected non-space characters in the after body phase."),
+        "Unexpected non-space characters in the after body phase.",
     "unexpected-start-tag-after-body":
-        _("Unexpected start tag token (%(name)s)"
-          " in the after body phase."),
+        "Unexpected start tag token (%(name)s)"
+        " in the after body phase.",
     "unexpected-end-tag-after-body":
-        _("Unexpected end tag token (%(name)s)"
-          " in the after body phase."),
+        "Unexpected end tag token (%(name)s)"
+        " in the after body phase.",
     "unexpected-char-in-frameset":
-        _("Unexpected characters in the frameset phase. Characters ignored."),
+        "Unexpected characters in the frameset phase. Characters ignored.",
     "unexpected-start-tag-in-frameset":
-        _("Unexpected start tag token (%(name)s)"
-          " in the frameset phase. Ignored."),
+        "Unexpected start tag token (%(name)s)"
+        " in the frameset phase. Ignored.",
     "unexpected-frameset-in-frameset-innerhtml":
-        _("Unexpected end tag token (frameset) "
-          "in the frameset phase (innerHTML)."),
+        "Unexpected end tag token (frameset) "
+        "in the frameset phase (innerHTML).",
     "unexpected-end-tag-in-frameset":
-        _("Unexpected end tag token (%(name)s)"
-          " in the frameset phase. Ignored."),
+        "Unexpected end tag token (%(name)s)"
+        " in the frameset phase. Ignored.",
     "unexpected-char-after-frameset":
-        _("Unexpected non-space characters in the "
-          "after frameset phase. Ignored."),
+        "Unexpected non-space characters in the "
+        "after frameset phase. Ignored.",
     "unexpected-start-tag-after-frameset":
-        _("Unexpected start tag (%(name)s)"
-          " in the after frameset phase. Ignored."),
+        "Unexpected start tag (%(name)s)"
+        " in the after frameset phase. Ignored.",
     "unexpected-end-tag-after-frameset":
-        _("Unexpected end tag (%(name)s)"
-          " in the after frameset phase. Ignored."),
+        "Unexpected end tag (%(name)s)"
+        " in the after frameset phase. Ignored.",
     "unexpected-end-tag-after-body-innerhtml":
-        _("Unexpected end tag after body(innerHtml)"),
+        "Unexpected end tag after body(innerHtml)",
     "expected-eof-but-got-char":
-        _("Unexpected non-space characters. Expected end of file."),
+        "Unexpected non-space characters. Expected end of file.",
     "expected-eof-but-got-start-tag":
-        _("Unexpected start tag (%(name)s)"
-          ". Expected end of file."),
+        "Unexpected start tag (%(name)s)"
+        ". Expected end of file.",
     "expected-eof-but-got-end-tag":
-        _("Unexpected end tag (%(name)s)"
-          ". Expected end of file."),
+        "Unexpected end tag (%(name)s)"
+        ". Expected end of file.",
     "eof-in-table":
-        _("Unexpected end of file. Expected table content."),
+        "Unexpected end of file. Expected table content.",
     "eof-in-select":
-        _("Unexpected end of file. Expected select content."),
+        "Unexpected end of file. Expected select content.",
     "eof-in-frameset":
-        _("Unexpected end of file. Expected frameset content."),
+        "Unexpected end of file. Expected frameset content.",
     "eof-in-script-in-script":
-        _("Unexpected end of file. Expected script content."),
+        "Unexpected end of file. Expected script content.",
     "eof-in-foreign-lands":
-        _("Unexpected end of file. Expected foreign content"),
+        "Unexpected end of file. Expected foreign content",
     "non-void-element-with-trailing-solidus":
-        _("Trailing solidus not allowed on element %(name)s"),
+        "Trailing solidus not allowed on element %(name)s",
     "unexpected-html-element-in-foreign-content":
-        _("Element %(name)s not allowed in a non-html context"),
+        "Element %(name)s not allowed in a non-html context",
     "unexpected-end-tag-before-html":
-        _("Unexpected end tag (%(name)s) before html."),
+        "Unexpected end tag (%(name)s) before html.",
+    "unexpected-inhead-noscript-tag":
+        "Element %(name)s not allowed in a inhead-noscript context",
+    "eof-in-head-noscript":
+        "Unexpected end of file. Expected inhead-noscript content",
+    "char-in-head-noscript":
+        "Unexpected non-space character. Expected inhead-noscript content",
     "XXX-undefined-error":
-        _("Undefined error (this sucks and should be fixed)"),
+        "Undefined error (this sucks and should be fixed)",
 }

 namespaces = {
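With the gettext wrapper gone, each E entry is a plain %-style template keyed by an error code; the parser fills it with the token's datavars, and any localization now has to happen in the caller. A minimal sketch of the lookup (illustrative, assuming the vendored package is importable as html5lib):

    from html5lib.constants import E

    message = E["unexpected-end-tag"] % {"name": "div"}
    # -> "Unexpected end tag (div). Ignored."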
@@ -298,7 +302,7 @@ namespaces = {
     "xmlns": "http://www.w3.org/2000/xmlns/"
 }

-scopingElements = frozenset((
+scopingElements = frozenset([
     (namespaces["html"], "applet"),
     (namespaces["html"], "caption"),
     (namespaces["html"], "html"),

@@ -316,9 +320,9 @@ scopingElements = frozenset((
     (namespaces["svg"], "foreignObject"),
     (namespaces["svg"], "desc"),
     (namespaces["svg"], "title"),
-))
+])

-formattingElements = frozenset((
+formattingElements = frozenset([
     (namespaces["html"], "a"),
     (namespaces["html"], "b"),
     (namespaces["html"], "big"),

@@ -333,9 +337,9 @@ formattingElements = frozenset((
     (namespaces["html"], "strong"),
     (namespaces["html"], "tt"),
     (namespaces["html"], "u")
-))
+])

-specialElements = frozenset((
+specialElements = frozenset([
     (namespaces["html"], "address"),
     (namespaces["html"], "applet"),
     (namespaces["html"], "area"),

@@ -416,22 +420,89 @@ specialElements = frozenset((
     (namespaces["html"], "wbr"),
     (namespaces["html"], "xmp"),
     (namespaces["svg"], "foreignObject")
-))
+])

-htmlIntegrationPointElements = frozenset((
-    (namespaces["mathml"], "annotaion-xml"),
+htmlIntegrationPointElements = frozenset([
+    (namespaces["mathml"], "annotation-xml"),
     (namespaces["svg"], "foreignObject"),
     (namespaces["svg"], "desc"),
     (namespaces["svg"], "title")
-))
+])

-mathmlTextIntegrationPointElements = frozenset((
+mathmlTextIntegrationPointElements = frozenset([
     (namespaces["mathml"], "mi"),
     (namespaces["mathml"], "mo"),
     (namespaces["mathml"], "mn"),
     (namespaces["mathml"], "ms"),
     (namespaces["mathml"], "mtext")
-))
+])

+adjustSVGAttributes = {
+    "attributename": "attributeName",
+    "attributetype": "attributeType",
+    "basefrequency": "baseFrequency",
+    "baseprofile": "baseProfile",
+    "calcmode": "calcMode",
+    "clippathunits": "clipPathUnits",
+    "contentscripttype": "contentScriptType",
+    "contentstyletype": "contentStyleType",
+    "diffuseconstant": "diffuseConstant",
+    "edgemode": "edgeMode",
+    "externalresourcesrequired": "externalResourcesRequired",
+    "filterres": "filterRes",
+    "filterunits": "filterUnits",
+    "glyphref": "glyphRef",
+    "gradienttransform": "gradientTransform",
+    "gradientunits": "gradientUnits",
+    "kernelmatrix": "kernelMatrix",
+    "kernelunitlength": "kernelUnitLength",
+    "keypoints": "keyPoints",
+    "keysplines": "keySplines",
+    "keytimes": "keyTimes",
+    "lengthadjust": "lengthAdjust",
+    "limitingconeangle": "limitingConeAngle",
+    "markerheight": "markerHeight",
+    "markerunits": "markerUnits",
+    "markerwidth": "markerWidth",
+    "maskcontentunits": "maskContentUnits",
+    "maskunits": "maskUnits",
+    "numoctaves": "numOctaves",
+    "pathlength": "pathLength",
+    "patterncontentunits": "patternContentUnits",
+    "patterntransform": "patternTransform",
+    "patternunits": "patternUnits",
+    "pointsatx": "pointsAtX",
+    "pointsaty": "pointsAtY",
+    "pointsatz": "pointsAtZ",
+    "preservealpha": "preserveAlpha",
+    "preserveaspectratio": "preserveAspectRatio",
+    "primitiveunits": "primitiveUnits",
+    "refx": "refX",
+    "refy": "refY",
+    "repeatcount": "repeatCount",
+    "repeatdur": "repeatDur",
+    "requiredextensions": "requiredExtensions",
+    "requiredfeatures": "requiredFeatures",
+    "specularconstant": "specularConstant",
+    "specularexponent": "specularExponent",
+    "spreadmethod": "spreadMethod",
+    "startoffset": "startOffset",
+    "stddeviation": "stdDeviation",
+    "stitchtiles": "stitchTiles",
+    "surfacescale": "surfaceScale",
+    "systemlanguage": "systemLanguage",
+    "tablevalues": "tableValues",
+    "targetx": "targetX",
+    "targety": "targetY",
+    "textlength": "textLength",
+    "viewbox": "viewBox",
+    "viewtarget": "viewTarget",
+    "xchannelselector": "xChannelSelector",
+    "ychannelselector": "yChannelSelector",
+    "zoomandpan": "zoomAndPan"
+}
+
+adjustMathMLAttributes = {"definitionurl": "definitionURL"}

 adjustForeignAttributes = {
     "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
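Note the annotaion-xml/annotation-xml fix above corrects a long-standing typo in the integration-point table. The new adjustSVGAttributes / adjustMathMLAttributes maps feed the tree constructor's "adjust foreign attributes" step: the tokenizer lowercases attribute names, and these maps restore the mixed-case spellings SVG and MathML require. A hedged sketch of the lookup (helper name is illustrative, not the parser's actual code):

    from html5lib.constants import adjustSVGAttributes

    def adjust_svg_attrs(attrs):
        # map lowercased SVG attribute names back to their camelCase forms
        return {adjustSVGAttributes.get(name, name): value
                for name, value in attrs.items()}

    adjust_svg_attrs({"viewbox": "0 0 10 10"})  # {'viewBox': '0 0 10 10'}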
@@ -451,21 +522,21 @@ adjustForeignAttributes = {
 unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
                                   adjustForeignAttributes.items()])

-spaceCharacters = frozenset((
+spaceCharacters = frozenset([
     "\t",
     "\n",
     "\u000C",
     " ",
     "\r"
-))
+])

-tableInsertModeElements = frozenset((
+tableInsertModeElements = frozenset([
     "table",
     "tbody",
     "tfoot",
     "thead",
     "tr"
-))
+])

 asciiLowercase = frozenset(string.ascii_lowercase)
 asciiUppercase = frozenset(string.ascii_uppercase)

@@ -486,7 +557,7 @@ headingElements = (
     "h6"
 )

-voidElements = frozenset((
+voidElements = frozenset([
     "base",
     "command",
     "event-source",

@@ -502,11 +573,11 @@ voidElements = frozenset((
     "input",
     "source",
     "track"
-))
+])

-cdataElements = frozenset(('title', 'textarea'))
+cdataElements = frozenset(['title', 'textarea'])

-rcdataElements = frozenset((
+rcdataElements = frozenset([
     'style',
     'script',
     'xmp',

@@ -514,27 +585,28 @@ rcdataElements = frozenset((
     'noembed',
     'noframes',
     'noscript'
-))
+])

 booleanAttributes = {
-    "": frozenset(("irrelevant",)),
+    "": frozenset(["irrelevant", "itemscope"]),
-    "style": frozenset(("scoped",)),
+    "style": frozenset(["scoped"]),
-    "img": frozenset(("ismap",)),
+    "img": frozenset(["ismap"]),
-    "audio": frozenset(("autoplay", "controls")),
+    "audio": frozenset(["autoplay", "controls"]),
-    "video": frozenset(("autoplay", "controls")),
+    "video": frozenset(["autoplay", "controls"]),
-    "script": frozenset(("defer", "async")),
+    "script": frozenset(["defer", "async"]),
-    "details": frozenset(("open",)),
+    "details": frozenset(["open"]),
-    "datagrid": frozenset(("multiple", "disabled")),
+    "datagrid": frozenset(["multiple", "disabled"]),
-    "command": frozenset(("hidden", "disabled", "checked", "default")),
+    "command": frozenset(["hidden", "disabled", "checked", "default"]),
-    "hr": frozenset(("noshade")),
+    "hr": frozenset(["noshade"]),
-    "menu": frozenset(("autosubmit",)),
+    "menu": frozenset(["autosubmit"]),
-    "fieldset": frozenset(("disabled", "readonly")),
+    "fieldset": frozenset(["disabled", "readonly"]),
-    "option": frozenset(("disabled", "readonly", "selected")),
+    "option": frozenset(["disabled", "readonly", "selected"]),
-    "optgroup": frozenset(("disabled", "readonly")),
+    "optgroup": frozenset(["disabled", "readonly"]),
-    "button": frozenset(("disabled", "autofocus")),
+    "button": frozenset(["disabled", "autofocus"]),
-    "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")),
+    "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
-    "select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
+    "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
-    "output": frozenset(("disabled", "readonly")),
+    "output": frozenset(["disabled", "readonly"]),
+    "iframe": frozenset(["seamless"]),
 }

 # entitiesWindows1252 has to be _ordered_ and needs to have an index. It
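One booleanAttributes change above is more than a style sweep: the old entry "hr": frozenset(("noshade")) passed a bare string, because ("noshade") is just a parenthesized string, not a tuple. The frozenset therefore contained the seven letters rather than the word, and the list literal fixes that:

    frozenset(("noshade"))   # frozenset({'a', 'd', 'e', 'h', 'n', 'o', 's'})
    frozenset(["noshade"])   # frozenset({'noshade'})
    "noshade" in frozenset(("noshade",))  # a one-element tuple would also work: True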
@@ -574,7 +646,7 @@ entitiesWindows1252 = (
     376   # 0x9F  0x0178  LATIN CAPITAL LETTER Y WITH DIAERESIS
 )

-xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
+xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;'])

 entities = {
     "AElig": "\xc6",
@@ -2815,7 +2887,6 @@ replacementCharacters = {
     0x0d: "\u000D",
     0x80: "\u20AC",
     0x81: "\u0081",
-    0x81: "\u0081",
     0x82: "\u201A",
     0x83: "\u0192",
     0x84: "\u201E",
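For context, replacementCharacters backs the tokenizer's handling of numeric character references in the 0x80-0x9F range: an illegal C1 control codepoint is replaced with the character windows-1252 puts at that byte. A short illustration (assuming the vendored html5lib import path):

    from html5lib.constants import replacementCharacters

    replacementCharacters[0x80]  # '\u20ac', so &#128; renders as the euro sign

The deleted duplicate 0x81 key was harmless but misleading: in a dict literal the second occurrence silently overwrites the first.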
@ -2848,235 +2919,6 @@ replacementCharacters = {
|
||||||
0x9F: "\u0178",
|
0x9F: "\u0178",
|
||||||
}
|
}
|
||||||
|
|
||||||
encodings = {
|
|
||||||
'437': 'cp437',
|
|
||||||
'850': 'cp850',
|
|
||||||
'852': 'cp852',
|
|
||||||
'855': 'cp855',
|
|
||||||
'857': 'cp857',
|
|
||||||
'860': 'cp860',
|
|
||||||
'861': 'cp861',
|
|
||||||
'862': 'cp862',
|
|
||||||
'863': 'cp863',
|
|
||||||
'865': 'cp865',
|
|
||||||
'866': 'cp866',
|
|
||||||
'869': 'cp869',
|
|
||||||
'ansix341968': 'ascii',
|
|
||||||
'ansix341986': 'ascii',
|
|
||||||
'arabic': 'iso8859-6',
|
|
||||||
'ascii': 'ascii',
|
|
||||||
'asmo708': 'iso8859-6',
|
|
||||||
'big5': 'big5',
|
|
||||||
'big5hkscs': 'big5hkscs',
|
|
||||||
'chinese': 'gbk',
|
|
||||||
'cp037': 'cp037',
|
|
||||||
'cp1026': 'cp1026',
|
|
||||||
'cp154': 'ptcp154',
|
|
||||||
'cp367': 'ascii',
|
|
||||||
'cp424': 'cp424',
|
|
||||||
'cp437': 'cp437',
|
|
||||||
'cp500': 'cp500',
|
|
||||||
'cp775': 'cp775',
|
|
||||||
'cp819': 'windows-1252',
|
|
||||||
'cp850': 'cp850',
|
|
||||||
'cp852': 'cp852',
|
|
||||||
'cp855': 'cp855',
|
|
||||||
'cp857': 'cp857',
|
|
||||||
'cp860': 'cp860',
|
|
||||||
'cp861': 'cp861',
|
|
||||||
'cp862': 'cp862',
|
|
||||||
'cp863': 'cp863',
|
|
||||||
'cp864': 'cp864',
|
|
||||||
'cp865': 'cp865',
|
|
||||||
'cp866': 'cp866',
|
|
||||||
'cp869': 'cp869',
|
|
||||||
'cp936': 'gbk',
|
|
||||||
'cpgr': 'cp869',
|
|
||||||
'cpis': 'cp861',
|
|
||||||
'csascii': 'ascii',
|
|
||||||
'csbig5': 'big5',
|
|
||||||
'cseuckr': 'cp949',
|
|
||||||
'cseucpkdfmtjapanese': 'euc_jp',
|
|
||||||
'csgb2312': 'gbk',
|
|
||||||
'cshproman8': 'hp-roman8',
|
|
||||||
'csibm037': 'cp037',
|
|
||||||
'csibm1026': 'cp1026',
|
|
||||||
'csibm424': 'cp424',
|
|
||||||
'csibm500': 'cp500',
|
|
||||||
'csibm855': 'cp855',
|
|
||||||
'csibm857': 'cp857',
|
|
||||||
'csibm860': 'cp860',
|
|
||||||
'csibm861': 'cp861',
|
|
||||||
'csibm863': 'cp863',
|
|
||||||
'csibm864': 'cp864',
|
|
||||||
'csibm865': 'cp865',
|
|
||||||
'csibm866': 'cp866',
|
|
||||||
'csibm869': 'cp869',
|
|
||||||
'csiso2022jp': 'iso2022_jp',
|
|
||||||
'csiso2022jp2': 'iso2022_jp_2',
|
|
||||||
'csiso2022kr': 'iso2022_kr',
|
|
||||||
'csiso58gb231280': 'gbk',
|
|
||||||
'csisolatin1': 'windows-1252',
|
|
||||||
'csisolatin2': 'iso8859-2',
|
|
||||||
'csisolatin3': 'iso8859-3',
|
|
||||||
'csisolatin4': 'iso8859-4',
|
|
||||||
'csisolatin5': 'windows-1254',
|
|
||||||
'csisolatin6': 'iso8859-10',
|
|
||||||
'csisolatinarabic': 'iso8859-6',
|
|
||||||
'csisolatincyrillic': 'iso8859-5',
|
|
||||||
'csisolatingreek': 'iso8859-7',
|
|
||||||
'csisolatinhebrew': 'iso8859-8',
|
|
||||||
'cskoi8r': 'koi8-r',
|
|
||||||
'csksc56011987': 'cp949',
|
|
||||||
'cspc775baltic': 'cp775',
|
|
||||||
'cspc850multilingual': 'cp850',
|
|
||||||
'cspc862latinhebrew': 'cp862',
|
|
||||||
'cspc8codepage437': 'cp437',
|
|
||||||
'cspcp852': 'cp852',
|
|
||||||
'csptcp154': 'ptcp154',
|
|
||||||
'csshiftjis': 'shift_jis',
|
|
||||||
'csunicode11utf7': 'utf-7',
|
|
||||||
'cyrillic': 'iso8859-5',
|
|
||||||
'cyrillicasian': 'ptcp154',
|
|
||||||
'ebcdiccpbe': 'cp500',
|
|
||||||
'ebcdiccpca': 'cp037',
|
|
||||||
'ebcdiccpch': 'cp500',
|
|
||||||
'ebcdiccphe': 'cp424',
|
|
||||||
'ebcdiccpnl': 'cp037',
|
|
||||||
'ebcdiccpus': 'cp037',
|
|
||||||
'ebcdiccpwt': 'cp037',
|
|
||||||
'ecma114': 'iso8859-6',
|
|
||||||
'ecma118': 'iso8859-7',
|
|
||||||
'elot928': 'iso8859-7',
|
|
||||||
'eucjp': 'euc_jp',
|
|
||||||
'euckr': 'cp949',
|
|
||||||
'extendedunixcodepackedformatforjapanese': 'euc_jp',
|
|
||||||
'gb18030': 'gb18030',
|
|
||||||
'gb2312': 'gbk',
|
|
||||||
'gb231280': 'gbk',
|
|
||||||
'gbk': 'gbk',
|
|
||||||
'greek': 'iso8859-7',
|
|
||||||
'greek8': 'iso8859-7',
|
|
||||||
'hebrew': 'iso8859-8',
|
|
||||||
'hproman8': 'hp-roman8',
|
|
||||||
'hzgb2312': 'hz',
|
|
||||||
'ibm037': 'cp037',
|
|
||||||
'ibm1026': 'cp1026',
|
|
||||||
'ibm367': 'ascii',
|
|
||||||
'ibm424': 'cp424',
|
|
||||||
'ibm437': 'cp437',
|
|
||||||
'ibm500': 'cp500',
|
|
||||||
'ibm775': 'cp775',
|
|
||||||
'ibm819': 'windows-1252',
|
|
||||||
'ibm850': 'cp850',
|
|
||||||
'ibm852': 'cp852',
|
|
||||||
'ibm855': 'cp855',
|
|
||||||
'ibm857': 'cp857',
|
|
||||||
'ibm860': 'cp860',
|
|
||||||
'ibm861': 'cp861',
|
|
||||||
'ibm862': 'cp862',
|
|
||||||
'ibm863': 'cp863',
|
|
||||||
'ibm864': 'cp864',
|
|
||||||
'ibm865': 'cp865',
|
|
||||||
'ibm866': 'cp866',
|
|
||||||
'ibm869': 'cp869',
|
|
||||||
'iso2022jp': 'iso2022_jp',
|
|
||||||
'iso2022jp2': 'iso2022_jp_2',
|
|
||||||
'iso2022kr': 'iso2022_kr',
|
|
||||||
'iso646irv1991': 'ascii',
|
|
||||||
'iso646us': 'ascii',
|
|
||||||
'iso88591': 'windows-1252',
|
|
||||||
'iso885910': 'iso8859-10',
|
|
||||||
'iso8859101992': 'iso8859-10',
|
|
||||||
'iso885911987': 'windows-1252',
|
|
||||||
'iso885913': 'iso8859-13',
|
|
||||||
'iso885914': 'iso8859-14',
|
|
||||||
'iso8859141998': 'iso8859-14',
|
|
||||||
'iso885915': 'iso8859-15',
|
|
||||||
'iso885916': 'iso8859-16',
|
|
||||||
'iso8859162001': 'iso8859-16',
|
|
||||||
'iso88592': 'iso8859-2',
|
|
||||||
'iso885921987': 'iso8859-2',
|
|
||||||
'iso88593': 'iso8859-3',
|
|
||||||
'iso885931988': 'iso8859-3',
|
|
||||||
'iso88594': 'iso8859-4',
|
|
||||||
'iso885941988': 'iso8859-4',
|
|
||||||
'iso88595': 'iso8859-5',
|
|
||||||
'iso885951988': 'iso8859-5',
|
|
||||||
'iso88596': 'iso8859-6',
|
|
||||||
'iso885961987': 'iso8859-6',
|
|
||||||
'iso88597': 'iso8859-7',
|
|
||||||
'iso885971987': 'iso8859-7',
|
|
||||||
'iso88598': 'iso8859-8',
|
|
||||||
'iso885981988': 'iso8859-8',
|
|
||||||
'iso88599': 'windows-1254',
|
|
||||||
'iso885991989': 'windows-1254',
|
|
||||||
'isoceltic': 'iso8859-14',
|
|
||||||
'isoir100': 'windows-1252',
|
|
||||||
'isoir101': 'iso8859-2',
|
|
||||||
'isoir109': 'iso8859-3',
|
|
||||||
'isoir110': 'iso8859-4',
|
|
||||||
'isoir126': 'iso8859-7',
|
|
||||||
'isoir127': 'iso8859-6',
|
|
||||||
'isoir138': 'iso8859-8',
|
|
||||||
'isoir144': 'iso8859-5',
|
|
||||||
'isoir148': 'windows-1254',
|
|
||||||
'isoir149': 'cp949',
|
|
||||||
'isoir157': 'iso8859-10',
|
|
||||||
'isoir199': 'iso8859-14',
|
|
||||||
'isoir226': 'iso8859-16',
|
|
||||||
'isoir58': 'gbk',
|
|
||||||
'isoir6': 'ascii',
|
|
||||||
'koi8r': 'koi8-r',
|
|
||||||
'koi8u': 'koi8-u',
|
|
||||||
'korean': 'cp949',
|
|
||||||
'ksc5601': 'cp949',
|
|
||||||
'ksc56011987': 'cp949',
|
|
||||||
'ksc56011989': 'cp949',
|
|
||||||
'l1': 'windows-1252',
|
|
||||||
'l10': 'iso8859-16',
|
|
||||||
'l2': 'iso8859-2',
|
|
||||||
'l3': 'iso8859-3',
|
|
||||||
'l4': 'iso8859-4',
|
|
||||||
'l5': 'windows-1254',
|
|
||||||
'l6': 'iso8859-10',
|
|
||||||
'l8': 'iso8859-14',
|
|
||||||
'latin1': 'windows-1252',
|
|
||||||
'latin10': 'iso8859-16',
|
|
||||||
'latin2': 'iso8859-2',
|
|
||||||
'latin3': 'iso8859-3',
|
|
||||||
'latin4': 'iso8859-4',
|
|
||||||
'latin5': 'windows-1254',
|
|
||||||
'latin6': 'iso8859-10',
|
|
||||||
'latin8': 'iso8859-14',
|
|
||||||
'latin9': 'iso8859-15',
|
|
||||||
'ms936': 'gbk',
|
|
||||||
'mskanji': 'shift_jis',
|
|
||||||
'pt154': 'ptcp154',
|
|
||||||
'ptcp154': 'ptcp154',
|
|
||||||
'r8': 'hp-roman8',
|
|
||||||
'roman8': 'hp-roman8',
|
|
||||||
'shiftjis': 'shift_jis',
|
|
||||||
'tis620': 'cp874',
|
|
||||||
'unicode11utf7': 'utf-7',
|
|
||||||
'us': 'ascii',
|
|
||||||
'usascii': 'ascii',
|
|
||||||
'utf16': 'utf-16',
|
|
||||||
'utf16be': 'utf-16-be',
|
|
||||||
'utf16le': 'utf-16-le',
|
|
||||||
'utf8': 'utf-8',
|
|
||||||
'windows1250': 'cp1250',
|
|
||||||
'windows1251': 'cp1251',
|
|
||||||
'windows1252': 'cp1252',
|
|
||||||
'windows1253': 'cp1253',
|
|
||||||
'windows1254': 'cp1254',
|
|
||||||
'windows1255': 'cp1255',
|
|
||||||
'windows1256': 'cp1256',
|
|
||||||
'windows1257': 'cp1257',
|
|
||||||
'windows1258': 'cp1258',
|
|
||||||
'windows936': 'gbk',
|
|
||||||
'x-x-big5': 'big5'}
|
|
||||||
|
|
||||||
tokenTypes = {
|
tokenTypes = {
|
||||||
"Doctype": 0,
|
"Doctype": 0,
|
||||||
"Characters": 1,
|
"Characters": 1,
|
||||||
|
@@ -3088,8 +2930,8 @@ tokenTypes = {
     "ParseError": 7
 }

-tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
-                           tokenTypes["EmptyTag"]))
+tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"],
+                           tokenTypes["EmptyTag"]])


 prefixes = dict([(v, k) for k, v in namespaces.items()])

@@ -3097,8 +2939,9 @@ prefixes["http://www.w3.org/1998/Math/MathML"] = "math"


 class DataLossWarning(UserWarning):
+    """Raised when the current tree is unable to represent the input data"""
     pass


-class ReparseException(Exception):
+class _ReparseException(Exception):
     pass
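tokenTypes maps token kind names to small integers, and tagTokenTypes groups the three tag-shaped kinds for membership tests; dispatch typically looks roughly like this (illustrative sketch, not code from the diff):

    from html5lib.constants import tokenTypes, tagTokenTypes

    token = {"type": tokenTypes["StartTag"], "name": "div", "data": {}}
    if token["type"] in tagTokenTypes:
        # StartTag, EndTag and EmptyTag tokens all carry a tag name
        print(token["name"])

The rename to _ReparseException likewise marks reparsing as internal control flow rather than public API.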
29  testing/web-platform/tests/tools/third_party/html5lib/html5lib/filters/alphabeticalattributes.py  (vendored, new file)

@@ -0,0 +1,29 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import base
+
+from collections import OrderedDict
+
+
+def _attr_key(attr):
+    """Return an appropriate key for an attribute for sorting
+
+    Attributes have a namespace that can be either ``None`` or a string. We
+    can't compare the two because they're different types, so we convert
+    ``None`` to an empty string first.
+
+    """
+    return (attr[0][0] or ''), attr[0][1]
+
+
+class Filter(base.Filter):
+    """Alphabetizes attributes for elements"""
+    def __iter__(self):
+        for token in base.Filter.__iter__(self):
+            if token["type"] in ("StartTag", "EmptyTag"):
+                attrs = OrderedDict()
+                for name, value in sorted(token["data"].items(),
+                                          key=_attr_key):
+                    attrs[name] = value
+                token["data"] = attrs
+            yield token
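The new filter is normally reached through the serializer rather than used directly; a hedged usage sketch (tree-walker name and option as in upstream html5lib 1.0.x):

    import html5lib
    from html5lib import serializer, treewalkers

    dom = html5lib.parse('<p b="2" a="1">hi</p>')
    walker = treewalkers.getTreeWalker("etree")
    s = serializer.HTMLSerializer(alphabetical_attributes=True)
    s.render(walker(dom))  # attributes are emitted in sorted order: a="1" b="2"

Sorting on (namespace or '', name) keeps namespaced and plain attributes comparable, which is the whole point of _attr_key.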
@@ -1,11 +1,19 @@
 from __future__ import absolute_import, division, unicode_literals

-from . import _base
+from . import base


-class Filter(_base.Filter):
+class Filter(base.Filter):
+    """Injects ``<meta charset=ENCODING>`` tag into head of document"""
     def __init__(self, source, encoding):
-        _base.Filter.__init__(self, source)
+        """Creates a Filter
+
+        :arg source: the source token stream
+
+        :arg encoding: the encoding to set
+
+        """
+        base.Filter.__init__(self, source)
         self.encoding = encoding

     def __iter__(self):

@@ -13,7 +21,7 @@ class Filter(_base.Filter):
         meta_found = (self.encoding is None)
         pending = []

-        for token in _base.Filter.__iter__(self):
+        for token in base.Filter.__iter__(self):
             type = token["type"]
             if type == "StartTag":
                 if token["name"].lower() == "head":
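Used on its own, this filter wraps a tree walker's token stream and splices a meta tag into head; a minimal sketch (module path as in upstream html5lib, other names illustrative):

    import html5lib
    from html5lib import treewalkers
    from html5lib.filters.inject_meta_charset import Filter as InjectMeta

    dom = html5lib.parse("<html><head></head><body>hi</body></html>")
    walker = treewalkers.getTreeWalker("etree")
    tokens = list(InjectMeta(walker(dom), "utf-8"))
    # the stream now carries an EmptyTag token for <meta charset="utf-8">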
93  testing/web-platform/tests/tools/third_party/html5lib/html5lib/filters/lint.py  (vendored, new file)

@@ -0,0 +1,93 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from six import text_type
+
+from . import base
+from ..constants import namespaces, voidElements
+
+from ..constants import spaceCharacters
+spaceCharacters = "".join(spaceCharacters)
+
+
+class Filter(base.Filter):
+    """Lints the token stream for errors
+
+    If it finds any errors, it'll raise an ``AssertionError``.
+
+    """
+    def __init__(self, source, require_matching_tags=True):
+        """Creates a Filter
+
+        :arg source: the source token stream
+
+        :arg require_matching_tags: whether or not to require matching tags
+
+        """
+        super(Filter, self).__init__(source)
+        self.require_matching_tags = require_matching_tags
+
+    def __iter__(self):
+        open_elements = []
+        for token in base.Filter.__iter__(self):
+            type = token["type"]
+            if type in ("StartTag", "EmptyTag"):
+                namespace = token["namespace"]
+                name = token["name"]
+                assert namespace is None or isinstance(namespace, text_type)
+                assert namespace != ""
+                assert isinstance(name, text_type)
+                assert name != ""
+                assert isinstance(token["data"], dict)
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+                    assert type == "EmptyTag"
+                else:
+                    assert type == "StartTag"
+                if type == "StartTag" and self.require_matching_tags:
+                    open_elements.append((namespace, name))
+                for (namespace, name), value in token["data"].items():
+                    assert namespace is None or isinstance(namespace, text_type)
+                    assert namespace != ""
+                    assert isinstance(name, text_type)
+                    assert name != ""
+                    assert isinstance(value, text_type)
+
+            elif type == "EndTag":
+                namespace = token["namespace"]
+                name = token["name"]
+                assert namespace is None or isinstance(namespace, text_type)
+                assert namespace != ""
+                assert isinstance(name, text_type)
+                assert name != ""
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+                    assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
+                elif self.require_matching_tags:
+                    start = open_elements.pop()
+                    assert start == (namespace, name)
+
+            elif type == "Comment":
+                data = token["data"]
+                assert isinstance(data, text_type)
+
+            elif type in ("Characters", "SpaceCharacters"):
+                data = token["data"]
+                assert isinstance(data, text_type)
+                assert data != ""
+                if type == "SpaceCharacters":
+                    assert data.strip(spaceCharacters) == ""
+
+            elif type == "Doctype":
+                name = token["name"]
+                assert name is None or isinstance(name, text_type)
+                assert token["publicId"] is None or isinstance(name, text_type)
+                assert token["systemId"] is None or isinstance(name, text_type)
+
+            elif type == "Entity":
+                assert isinstance(token["name"], text_type)
+
+            elif type == "SerializerError":
+                assert isinstance(token["data"], text_type)
+
+            else:
+                assert False, "Unknown token type: %(type)s" % {"type": type}
+
+            yield token
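The lint filter is a pass-through that asserts structural invariants on every token, so it can be dropped into any walker pipeline while debugging; a brief sketch (illustrative):

    import html5lib
    from html5lib import treewalkers
    from html5lib.filters.lint import Filter as LintFilter

    dom = html5lib.parse("<p>hello</p>")
    walker = treewalkers.getTreeWalker("etree")
    for token in LintFilter(walker(dom)):  # raises AssertionError on malformed tokens
        pass

Note the Doctype branch re-checks name where it presumably means publicId and systemId; that upstream quirk is reproduced above exactly as vendored.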
@@ -1,9 +1,10 @@
 from __future__ import absolute_import, division, unicode_literals

-from . import _base
+from . import base


-class Filter(_base.Filter):
+class Filter(base.Filter):
+    """Removes optional tags from the token stream"""
     def slider(self):
         previous1 = previous2 = None
         for token in self.source:

@@ -11,7 +12,8 @@ class Filter(_base.Filter):
             yield previous2, previous1, token
             previous2 = previous1
             previous1 = token
-        yield previous2, previous1, None
+        if previous1 is not None:
+            yield previous2, previous1, None

     def __iter__(self):
         for previous, token, next in self.slider():

@@ -58,7 +60,7 @@ class Filter(_base.Filter):
         elif tagname == 'colgroup':
             # A colgroup element's start tag may be omitted if the first thing
             # inside the colgroup element is a col element, and if the element
-            # is not immediately preceeded by another colgroup element whose
+            # is not immediately preceded by another colgroup element whose
             # end tag has been omitted.
             if type in ("StartTag", "EmptyTag"):
                 # XXX: we do not look at the preceding event, so instead we never

@@ -70,7 +72,7 @@ class Filter(_base.Filter):
         elif tagname == 'tbody':
             # A tbody element's start tag may be omitted if the first thing
             # inside the tbody element is a tr element, and if the element is
-            # not immediately preceeded by a tbody, thead, or tfoot element
+            # not immediately preceded by a tbody, thead, or tfoot element
             # whose end tag has been omitted.
             if type == "StartTag":
                 # omit the thead and tfoot elements' end tag when they are
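The new guard in slider() fixes an edge case: for an empty token stream the old code yielded a spurious (None, None, None) window. A standalone sketch of the before/after behavior (illustrative, mirroring the method above):

    def slider(source):
        previous1 = previous2 = None
        for token in source:
            if previous1 is not None:
                yield previous2, previous1, token
            previous2, previous1 = previous1, token
        if previous1 is not None:  # the added guard
            yield previous2, previous1, None

    list(slider([]))     # [] -- previously [(None, None, None)]
    list(slider(["a"]))  # [(None, 'a', None)]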
896
testing/web-platform/tests/tools/third_party/html5lib/html5lib/filters/sanitizer.py
поставляемый
Normal file
896
testing/web-platform/tests/tools/third_party/html5lib/html5lib/filters/sanitizer.py
поставляемый
Normal file
|
@ -0,0 +1,896 @@
|
||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
import re
|
||||||
|
from xml.sax.saxutils import escape, unescape
|
||||||
|
|
||||||
|
from six.moves import urllib_parse as urlparse
|
||||||
|
|
||||||
|
from . import base
|
||||||
|
from ..constants import namespaces, prefixes
|
||||||
|
|
||||||
|
__all__ = ["Filter"]
|
||||||
|
|
||||||
|
|
||||||
|
allowed_elements = frozenset((
|
||||||
|
(namespaces['html'], 'a'),
|
||||||
|
(namespaces['html'], 'abbr'),
|
||||||
|
(namespaces['html'], 'acronym'),
|
||||||
|
(namespaces['html'], 'address'),
|
||||||
|
(namespaces['html'], 'area'),
|
||||||
|
(namespaces['html'], 'article'),
|
||||||
|
(namespaces['html'], 'aside'),
|
||||||
|
    (namespaces['html'], 'audio'),
    (namespaces['html'], 'b'),
    (namespaces['html'], 'big'),
    (namespaces['html'], 'blockquote'),
    (namespaces['html'], 'br'),
    (namespaces['html'], 'button'),
    (namespaces['html'], 'canvas'),
    (namespaces['html'], 'caption'),
    (namespaces['html'], 'center'),
    (namespaces['html'], 'cite'),
    (namespaces['html'], 'code'),
    (namespaces['html'], 'col'),
    (namespaces['html'], 'colgroup'),
    (namespaces['html'], 'command'),
    (namespaces['html'], 'datagrid'),
    (namespaces['html'], 'datalist'),
    (namespaces['html'], 'dd'),
    (namespaces['html'], 'del'),
    (namespaces['html'], 'details'),
    (namespaces['html'], 'dfn'),
    (namespaces['html'], 'dialog'),
    (namespaces['html'], 'dir'),
    (namespaces['html'], 'div'),
    (namespaces['html'], 'dl'),
    (namespaces['html'], 'dt'),
    (namespaces['html'], 'em'),
    (namespaces['html'], 'event-source'),
    (namespaces['html'], 'fieldset'),
    (namespaces['html'], 'figcaption'),
    (namespaces['html'], 'figure'),
    (namespaces['html'], 'footer'),
    (namespaces['html'], 'font'),
    (namespaces['html'], 'form'),
    (namespaces['html'], 'header'),
    (namespaces['html'], 'h1'),
    (namespaces['html'], 'h2'),
    (namespaces['html'], 'h3'),
    (namespaces['html'], 'h4'),
    (namespaces['html'], 'h5'),
    (namespaces['html'], 'h6'),
    (namespaces['html'], 'hr'),
    (namespaces['html'], 'i'),
    (namespaces['html'], 'img'),
    (namespaces['html'], 'input'),
    (namespaces['html'], 'ins'),
    (namespaces['html'], 'keygen'),
    (namespaces['html'], 'kbd'),
    (namespaces['html'], 'label'),
    (namespaces['html'], 'legend'),
    (namespaces['html'], 'li'),
    (namespaces['html'], 'm'),
    (namespaces['html'], 'map'),
    (namespaces['html'], 'menu'),
    (namespaces['html'], 'meter'),
    (namespaces['html'], 'multicol'),
    (namespaces['html'], 'nav'),
    (namespaces['html'], 'nextid'),
    (namespaces['html'], 'ol'),
    (namespaces['html'], 'output'),
    (namespaces['html'], 'optgroup'),
    (namespaces['html'], 'option'),
    (namespaces['html'], 'p'),
    (namespaces['html'], 'pre'),
    (namespaces['html'], 'progress'),
    (namespaces['html'], 'q'),
    (namespaces['html'], 's'),
    (namespaces['html'], 'samp'),
    (namespaces['html'], 'section'),
    (namespaces['html'], 'select'),
    (namespaces['html'], 'small'),
    (namespaces['html'], 'sound'),
    (namespaces['html'], 'source'),
    (namespaces['html'], 'spacer'),
    (namespaces['html'], 'span'),
    (namespaces['html'], 'strike'),
    (namespaces['html'], 'strong'),
    (namespaces['html'], 'sub'),
    (namespaces['html'], 'sup'),
    (namespaces['html'], 'table'),
    (namespaces['html'], 'tbody'),
    (namespaces['html'], 'td'),
    (namespaces['html'], 'textarea'),
    (namespaces['html'], 'time'),
    (namespaces['html'], 'tfoot'),
    (namespaces['html'], 'th'),
    (namespaces['html'], 'thead'),
    (namespaces['html'], 'tr'),
    (namespaces['html'], 'tt'),
    (namespaces['html'], 'u'),
    (namespaces['html'], 'ul'),
    (namespaces['html'], 'var'),
    (namespaces['html'], 'video'),
    (namespaces['mathml'], 'maction'),
    (namespaces['mathml'], 'math'),
    (namespaces['mathml'], 'merror'),
    (namespaces['mathml'], 'mfrac'),
    (namespaces['mathml'], 'mi'),
    (namespaces['mathml'], 'mmultiscripts'),
    (namespaces['mathml'], 'mn'),
    (namespaces['mathml'], 'mo'),
    (namespaces['mathml'], 'mover'),
    (namespaces['mathml'], 'mpadded'),
    (namespaces['mathml'], 'mphantom'),
    (namespaces['mathml'], 'mprescripts'),
    (namespaces['mathml'], 'mroot'),
    (namespaces['mathml'], 'mrow'),
    (namespaces['mathml'], 'mspace'),
    (namespaces['mathml'], 'msqrt'),
    (namespaces['mathml'], 'mstyle'),
    (namespaces['mathml'], 'msub'),
    (namespaces['mathml'], 'msubsup'),
    (namespaces['mathml'], 'msup'),
    (namespaces['mathml'], 'mtable'),
    (namespaces['mathml'], 'mtd'),
    (namespaces['mathml'], 'mtext'),
    (namespaces['mathml'], 'mtr'),
    (namespaces['mathml'], 'munder'),
    (namespaces['mathml'], 'munderover'),
    (namespaces['mathml'], 'none'),
    (namespaces['svg'], 'a'),
    (namespaces['svg'], 'animate'),
    (namespaces['svg'], 'animateColor'),
    (namespaces['svg'], 'animateMotion'),
    (namespaces['svg'], 'animateTransform'),
    (namespaces['svg'], 'clipPath'),
    (namespaces['svg'], 'circle'),
    (namespaces['svg'], 'defs'),
    (namespaces['svg'], 'desc'),
    (namespaces['svg'], 'ellipse'),
    (namespaces['svg'], 'font-face'),
    (namespaces['svg'], 'font-face-name'),
    (namespaces['svg'], 'font-face-src'),
    (namespaces['svg'], 'g'),
    (namespaces['svg'], 'glyph'),
    (namespaces['svg'], 'hkern'),
    (namespaces['svg'], 'linearGradient'),
    (namespaces['svg'], 'line'),
    (namespaces['svg'], 'marker'),
    (namespaces['svg'], 'metadata'),
    (namespaces['svg'], 'missing-glyph'),
    (namespaces['svg'], 'mpath'),
    (namespaces['svg'], 'path'),
    (namespaces['svg'], 'polygon'),
    (namespaces['svg'], 'polyline'),
    (namespaces['svg'], 'radialGradient'),
    (namespaces['svg'], 'rect'),
    (namespaces['svg'], 'set'),
    (namespaces['svg'], 'stop'),
    (namespaces['svg'], 'svg'),
    (namespaces['svg'], 'switch'),
    (namespaces['svg'], 'text'),
    (namespaces['svg'], 'title'),
    (namespaces['svg'], 'tspan'),
    (namespaces['svg'], 'use'),
))

allowed_attributes = frozenset((
    # HTML attributes
    (None, 'abbr'),
    (None, 'accept'),
    (None, 'accept-charset'),
    (None, 'accesskey'),
    (None, 'action'),
    (None, 'align'),
    (None, 'alt'),
    (None, 'autocomplete'),
    (None, 'autofocus'),
    (None, 'axis'),
    (None, 'background'),
    (None, 'balance'),
    (None, 'bgcolor'),
    (None, 'bgproperties'),
    (None, 'border'),
    (None, 'bordercolor'),
    (None, 'bordercolordark'),
    (None, 'bordercolorlight'),
    (None, 'bottompadding'),
    (None, 'cellpadding'),
    (None, 'cellspacing'),
    (None, 'ch'),
    (None, 'challenge'),
    (None, 'char'),
    (None, 'charoff'),
    (None, 'choff'),
    (None, 'charset'),
    (None, 'checked'),
    (None, 'cite'),
    (None, 'class'),
    (None, 'clear'),
    (None, 'color'),
    (None, 'cols'),
    (None, 'colspan'),
    (None, 'compact'),
    (None, 'contenteditable'),
    (None, 'controls'),
    (None, 'coords'),
    (None, 'data'),
    (None, 'datafld'),
    (None, 'datapagesize'),
    (None, 'datasrc'),
    (None, 'datetime'),
    (None, 'default'),
    (None, 'delay'),
    (None, 'dir'),
    (None, 'disabled'),
    (None, 'draggable'),
    (None, 'dynsrc'),
    (None, 'enctype'),
    (None, 'end'),
    (None, 'face'),
    (None, 'for'),
    (None, 'form'),
    (None, 'frame'),
    (None, 'galleryimg'),
    (None, 'gutter'),
    (None, 'headers'),
    (None, 'height'),
    (None, 'hidefocus'),
    (None, 'hidden'),
    (None, 'high'),
    (None, 'href'),
    (None, 'hreflang'),
    (None, 'hspace'),
    (None, 'icon'),
    (None, 'id'),
    (None, 'inputmode'),
    (None, 'ismap'),
    (None, 'keytype'),
    (None, 'label'),
    (None, 'leftspacing'),
    (None, 'lang'),
    (None, 'list'),
    (None, 'longdesc'),
    (None, 'loop'),
    (None, 'loopcount'),
    (None, 'loopend'),
    (None, 'loopstart'),
    (None, 'low'),
    (None, 'lowsrc'),
    (None, 'max'),
    (None, 'maxlength'),
    (None, 'media'),
    (None, 'method'),
    (None, 'min'),
    (None, 'multiple'),
    (None, 'name'),
    (None, 'nohref'),
    (None, 'noshade'),
    (None, 'nowrap'),
    (None, 'open'),
    (None, 'optimum'),
    (None, 'pattern'),
    (None, 'ping'),
    (None, 'point-size'),
    (None, 'poster'),
    (None, 'pqg'),
    (None, 'preload'),
    (None, 'prompt'),
    (None, 'radiogroup'),
    (None, 'readonly'),
    (None, 'rel'),
    (None, 'repeat-max'),
    (None, 'repeat-min'),
    (None, 'replace'),
    (None, 'required'),
    (None, 'rev'),
    (None, 'rightspacing'),
    (None, 'rows'),
    (None, 'rowspan'),
    (None, 'rules'),
    (None, 'scope'),
    (None, 'selected'),
    (None, 'shape'),
    (None, 'size'),
    (None, 'span'),
    (None, 'src'),
    (None, 'start'),
    (None, 'step'),
    (None, 'style'),
    (None, 'summary'),
    (None, 'suppress'),
    (None, 'tabindex'),
    (None, 'target'),
    (None, 'template'),
    (None, 'title'),
    (None, 'toppadding'),
    (None, 'type'),
    (None, 'unselectable'),
    (None, 'usemap'),
    (None, 'urn'),
    (None, 'valign'),
    (None, 'value'),
    (None, 'variable'),
    (None, 'volume'),
    (None, 'vspace'),
    (None, 'vrml'),
    (None, 'width'),
    (None, 'wrap'),
    (namespaces['xml'], 'lang'),
    # MathML attributes
    (None, 'actiontype'),
    (None, 'align'),
    (None, 'columnalign'),
    (None, 'columnalign'),
    (None, 'columnalign'),
    (None, 'columnlines'),
    (None, 'columnspacing'),
    (None, 'columnspan'),
    (None, 'depth'),
    (None, 'display'),
    (None, 'displaystyle'),
    (None, 'equalcolumns'),
    (None, 'equalrows'),
    (None, 'fence'),
    (None, 'fontstyle'),
    (None, 'fontweight'),
    (None, 'frame'),
    (None, 'height'),
    (None, 'linethickness'),
    (None, 'lspace'),
    (None, 'mathbackground'),
    (None, 'mathcolor'),
    (None, 'mathvariant'),
    (None, 'mathvariant'),
    (None, 'maxsize'),
    (None, 'minsize'),
    (None, 'other'),
    (None, 'rowalign'),
    (None, 'rowalign'),
    (None, 'rowalign'),
    (None, 'rowlines'),
    (None, 'rowspacing'),
    (None, 'rowspan'),
    (None, 'rspace'),
    (None, 'scriptlevel'),
    (None, 'selection'),
    (None, 'separator'),
    (None, 'stretchy'),
    (None, 'width'),
    (None, 'width'),
    (namespaces['xlink'], 'href'),
    (namespaces['xlink'], 'show'),
    (namespaces['xlink'], 'type'),
    # SVG attributes
    (None, 'accent-height'),
    (None, 'accumulate'),
    (None, 'additive'),
    (None, 'alphabetic'),
    (None, 'arabic-form'),
    (None, 'ascent'),
    (None, 'attributeName'),
    (None, 'attributeType'),
    (None, 'baseProfile'),
    (None, 'bbox'),
    (None, 'begin'),
    (None, 'by'),
    (None, 'calcMode'),
    (None, 'cap-height'),
    (None, 'class'),
    (None, 'clip-path'),
    (None, 'color'),
    (None, 'color-rendering'),
    (None, 'content'),
    (None, 'cx'),
    (None, 'cy'),
    (None, 'd'),
    (None, 'dx'),
    (None, 'dy'),
    (None, 'descent'),
    (None, 'display'),
    (None, 'dur'),
    (None, 'end'),
    (None, 'fill'),
    (None, 'fill-opacity'),
    (None, 'fill-rule'),
    (None, 'font-family'),
    (None, 'font-size'),
    (None, 'font-stretch'),
    (None, 'font-style'),
    (None, 'font-variant'),
    (None, 'font-weight'),
    (None, 'from'),
    (None, 'fx'),
    (None, 'fy'),
    (None, 'g1'),
    (None, 'g2'),
    (None, 'glyph-name'),
    (None, 'gradientUnits'),
    (None, 'hanging'),
    (None, 'height'),
    (None, 'horiz-adv-x'),
    (None, 'horiz-origin-x'),
    (None, 'id'),
    (None, 'ideographic'),
    (None, 'k'),
    (None, 'keyPoints'),
    (None, 'keySplines'),
    (None, 'keyTimes'),
    (None, 'lang'),
    (None, 'marker-end'),
    (None, 'marker-mid'),
    (None, 'marker-start'),
    (None, 'markerHeight'),
    (None, 'markerUnits'),
    (None, 'markerWidth'),
    (None, 'mathematical'),
    (None, 'max'),
    (None, 'min'),
    (None, 'name'),
    (None, 'offset'),
    (None, 'opacity'),
    (None, 'orient'),
    (None, 'origin'),
    (None, 'overline-position'),
    (None, 'overline-thickness'),
    (None, 'panose-1'),
    (None, 'path'),
    (None, 'pathLength'),
    (None, 'points'),
    (None, 'preserveAspectRatio'),
    (None, 'r'),
    (None, 'refX'),
    (None, 'refY'),
    (None, 'repeatCount'),
    (None, 'repeatDur'),
    (None, 'requiredExtensions'),
    (None, 'requiredFeatures'),
    (None, 'restart'),
    (None, 'rotate'),
    (None, 'rx'),
    (None, 'ry'),
    (None, 'slope'),
    (None, 'stemh'),
    (None, 'stemv'),
    (None, 'stop-color'),
    (None, 'stop-opacity'),
    (None, 'strikethrough-position'),
    (None, 'strikethrough-thickness'),
    (None, 'stroke'),
    (None, 'stroke-dasharray'),
    (None, 'stroke-dashoffset'),
    (None, 'stroke-linecap'),
    (None, 'stroke-linejoin'),
    (None, 'stroke-miterlimit'),
    (None, 'stroke-opacity'),
    (None, 'stroke-width'),
    (None, 'systemLanguage'),
    (None, 'target'),
    (None, 'text-anchor'),
    (None, 'to'),
    (None, 'transform'),
    (None, 'type'),
    (None, 'u1'),
    (None, 'u2'),
    (None, 'underline-position'),
    (None, 'underline-thickness'),
    (None, 'unicode'),
    (None, 'unicode-range'),
    (None, 'units-per-em'),
    (None, 'values'),
    (None, 'version'),
    (None, 'viewBox'),
    (None, 'visibility'),
    (None, 'width'),
    (None, 'widths'),
    (None, 'x'),
    (None, 'x-height'),
    (None, 'x1'),
    (None, 'x2'),
    (namespaces['xlink'], 'actuate'),
    (namespaces['xlink'], 'arcrole'),
    (namespaces['xlink'], 'href'),
    (namespaces['xlink'], 'role'),
    (namespaces['xlink'], 'show'),
    (namespaces['xlink'], 'title'),
    (namespaces['xlink'], 'type'),
    (namespaces['xml'], 'base'),
    (namespaces['xml'], 'lang'),
    (namespaces['xml'], 'space'),
    (None, 'y'),
    (None, 'y1'),
    (None, 'y2'),
    (None, 'zoomAndPan'),
))

attr_val_is_uri = frozenset((
    (None, 'href'),
    (None, 'src'),
    (None, 'cite'),
    (None, 'action'),
    (None, 'longdesc'),
    (None, 'poster'),
    (None, 'background'),
    (None, 'datasrc'),
    (None, 'dynsrc'),
    (None, 'lowsrc'),
    (None, 'ping'),
    (namespaces['xlink'], 'href'),
    (namespaces['xml'], 'base'),
))

svg_attr_val_allows_ref = frozenset((
    (None, 'clip-path'),
    (None, 'color-profile'),
    (None, 'cursor'),
    (None, 'fill'),
    (None, 'filter'),
    (None, 'marker'),
    (None, 'marker-start'),
    (None, 'marker-mid'),
    (None, 'marker-end'),
    (None, 'mask'),
    (None, 'stroke'),
))

svg_allow_local_href = frozenset((
    (None, 'altGlyph'),
    (None, 'animate'),
    (None, 'animateColor'),
    (None, 'animateMotion'),
    (None, 'animateTransform'),
    (None, 'cursor'),
    (None, 'feImage'),
    (None, 'filter'),
    (None, 'linearGradient'),
    (None, 'pattern'),
    (None, 'radialGradient'),
    (None, 'textpath'),
    (None, 'tref'),
    (None, 'set'),
    (None, 'use')
))

allowed_css_properties = frozenset((
    'azimuth',
    'background-color',
    'border-bottom-color',
    'border-collapse',
    'border-color',
    'border-left-color',
    'border-right-color',
    'border-top-color',
    'clear',
    'color',
    'cursor',
    'direction',
    'display',
    'elevation',
    'float',
    'font',
    'font-family',
    'font-size',
    'font-style',
    'font-variant',
    'font-weight',
    'height',
    'letter-spacing',
    'line-height',
    'overflow',
    'pause',
    'pause-after',
    'pause-before',
    'pitch',
    'pitch-range',
    'richness',
    'speak',
    'speak-header',
    'speak-numeral',
    'speak-punctuation',
    'speech-rate',
    'stress',
    'text-align',
    'text-decoration',
    'text-indent',
    'unicode-bidi',
    'vertical-align',
    'voice-family',
    'volume',
    'white-space',
    'width',
))

allowed_css_keywords = frozenset((
    'auto',
    'aqua',
    'black',
    'block',
    'blue',
    'bold',
    'both',
    'bottom',
    'brown',
    'center',
    'collapse',
    'dashed',
    'dotted',
    'fuchsia',
    'gray',
    'green',
    '!important',
    'italic',
    'left',
    'lime',
    'maroon',
    'medium',
    'none',
    'navy',
    'normal',
    'nowrap',
    'olive',
    'pointer',
    'purple',
    'red',
    'right',
    'solid',
    'silver',
    'teal',
    'top',
    'transparent',
    'underline',
    'white',
    'yellow',
))

allowed_svg_properties = frozenset((
    'fill',
    'fill-opacity',
    'fill-rule',
    'stroke',
    'stroke-width',
    'stroke-linecap',
    'stroke-linejoin',
    'stroke-opacity',
))

allowed_protocols = frozenset((
    'ed2k',
    'ftp',
    'http',
    'https',
    'irc',
    'mailto',
    'news',
    'gopher',
    'nntp',
    'telnet',
    'webcal',
    'xmpp',
    'callto',
    'feed',
    'urn',
    'aim',
    'rsync',
    'tag',
    'ssh',
    'sftp',
    'rtsp',
    'afs',
    'data',
))

allowed_content_types = frozenset((
    'image/png',
    'image/jpeg',
    'image/gif',
    'image/webp',
    'image/bmp',
    'text/plain',
))


data_content_type = re.compile(r'''
                                ^
                                # Match a content type <application>/<type>
                                (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                                # Match any character set and encoding
                                (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
                                 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
                                # Assume the rest is data
                                ,.*
                                $
                                ''',
                               re.VERBOSE)

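# Illustrative note (not part of the vendored source): the pattern above vets
# the head of a data: URI. For example,
#
#     data_content_type.match("image/png;base64,iVBORw0KGgo=").group("content_type")
#
# returns "image/png", which allowed_token() below checks against
# allowed_content_types before keeping the attribute.
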
class Filter(base.Filter):
    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
    def __init__(self,
                 source,
                 allowed_elements=allowed_elements,
                 allowed_attributes=allowed_attributes,
                 allowed_css_properties=allowed_css_properties,
                 allowed_css_keywords=allowed_css_keywords,
                 allowed_svg_properties=allowed_svg_properties,
                 allowed_protocols=allowed_protocols,
                 allowed_content_types=allowed_content_types,
                 attr_val_is_uri=attr_val_is_uri,
                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
                 svg_allow_local_href=svg_allow_local_href):
        """Creates a Filter

        :arg allowed_elements: set of elements to allow--everything else will
            be escaped

        :arg allowed_attributes: set of attributes to allow in
            elements--everything else will be stripped

        :arg allowed_css_properties: set of CSS properties to allow--everything
            else will be stripped

        :arg allowed_css_keywords: set of CSS keywords to allow--everything
            else will be stripped

        :arg allowed_svg_properties: set of SVG properties to allow--everything
            else will be removed

        :arg allowed_protocols: set of allowed protocols for URIs

        :arg allowed_content_types: set of allowed content types for ``data`` URIs.

        :arg attr_val_is_uri: set of attributes that have URI values--values
            that have a scheme not listed in ``allowed_protocols`` are removed

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements that can have local
            hrefs--these are removed

        """
        super(Filter, self).__init__(source)
        self.allowed_elements = allowed_elements
        self.allowed_attributes = allowed_attributes
        self.allowed_css_properties = allowed_css_properties
        self.allowed_css_keywords = allowed_css_keywords
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_protocols = allowed_protocols
        self.allowed_content_types = allowed_content_types
        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.svg_allow_local_href = svg_allow_local_href

    def __iter__(self):
        for token in base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
    # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
    # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
    # allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):
        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in ("StartTag", "EndTag", "EmptyTag"):
            name = token["name"]
            namespace = token["namespace"]
            if ((namespace, name) in self.allowed_elements or
                    (namespace is None and
                     (namespaces["html"], name) in self.allowed_elements)):
                return self.allowed_token(token)
            else:
                return self.disallowed_token(token)
        elif token_type == "Comment":
            pass
        else:
            return token

    def allowed_token(self, token):
        if "data" in token:
            attrs = token["data"]
            attr_names = set(attrs.keys())

            # Remove forbidden attributes
            for to_remove in (attr_names - self.allowed_attributes):
                del token["data"][to_remove]
                attr_names.remove(to_remove)

            # Remove attributes with disallowed URL values
            for attr in (attr_names & self.attr_val_is_uri):
                assert attr in attrs
                # I don't have a clue where this regexp comes from or why it matches those
                # characters, nor why we call unescape. I just know it's always been here.
                # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
                # this will do is remove *more* than it otherwise would.
                val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse.urlparse(val_unescaped)
                except ValueError:
                    uri = None
                    del attrs[attr]
                if uri and uri.scheme:
                    if uri.scheme not in self.allowed_protocols:
                        del attrs[attr]
                    if uri.scheme == 'data':
                        m = data_content_type.match(uri.path)
                        if not m:
                            del attrs[attr]
                        elif m.group('content_type') not in self.allowed_content_types:
                            del attrs[attr]

            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))
            if (token["name"] in self.svg_allow_local_href and
                    (namespaces['xlink'], 'href') in attrs and
                    re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])):
                del attrs[(namespaces['xlink'], 'href')]
            if (None, 'style') in attrs:
                attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
            token["data"] = attrs
        return token

    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                for keyword in value.split():
                    if keyword not in self.allowed_css_keywords and \
                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
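A minimal usage sketch for the filter above, assuming only the public html5lib API (`parseFragment`, `getTreeWalker`) and the serializer vendored below; the markup is illustrative:

import html5lib
from html5lib.filters.sanitizer import Filter as SanitizerFilter
from html5lib.serializer import HTMLSerializer

# Parse untrusted markup into a tree, walk it, sanitize the token
# stream, and serialize whatever survives.
fragment = html5lib.parseFragment('<a href="javascript:alert(1)">hi</a><script>x()</script>')
walker = html5lib.getTreeWalker("etree")
clean = HTMLSerializer().render(SanitizerFilter(walker(fragment)))
# The javascript: href is dropped and the disallowed <script> element is
# escaped, so clean is roughly: '<a>hi</a>&lt;script&gt;x()&lt;/script&gt;'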
@@ -2,20 +2,20 @@ from __future__ import absolute_import, division, unicode_literals

 import re

-from . import _base
+from . import base
 from ..constants import rcdataElements, spaceCharacters

 spaceCharacters = "".join(spaceCharacters)

 SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)


-class Filter(_base.Filter):
+class Filter(base.Filter):
+    """Collapses whitespace except in pre, textarea, and script elements"""
     spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

     def __iter__(self):
         preserve = 0
-        for token in _base.Filter.__iter__(self):
+        for token in base.Filter.__iter__(self):
             type = token["type"]
             if type == "StartTag" \
                     and (preserve or token["name"] in self.spacePreserveElements):
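The hunk above only renames the private `_base` module to `base`; the filter's behavior is unchanged. A small sketch of that behavior, assuming the public `serialize` helper vendored below:

import html5lib
from html5lib.serializer import serialize

doc = html5lib.parse("<p>a    b</p><pre>a    b</pre>")
# strip_whitespace routes the token stream through filters.whitespace.Filter:
# runs of whitespace collapse outside pre/textarea/script-like elements,
# so the <p> text becomes "a b" while the <pre> text is preserved.
html = serialize(doc, strip_whitespace=True)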
(Diff for one file not shown because of its large size.)
409  testing/web-platform/tests/tools/third_party/html5lib/html5lib/serializer.py  (vendored) Normal file
@@ -0,0 +1,409 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type

import re

from codecs import register_error, xmlcharrefreplace_errors

from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
from xml.sax.saxutils import escape

_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                   "\u3000]")


_encode_entity_map = {}
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
    # skip multi-character entities
    if ((_is_ucs4 and len(v) > 1) or
            (not _is_ucs4 and len(v) > 2)):
        continue
    if v != "&":
        if len(v) == 2:
            v = _utils.surrogatePairToCodepoint(v)
        else:
            v = ord(v)
        if v not in _encode_entity_map or k.islower():
            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
            _encode_entity_map[v] = k


def htmlentityreplace_errors(exc):
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        res = []
        codepoints = []
        skip = False
        for i, c in enumerate(exc.object[exc.start:exc.end]):
            if skip:
                skip = False
                continue
            index = i + exc.start
            if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
                codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
                skip = True
            else:
                codepoint = ord(c)
            codepoints.append(codepoint)
        for cp in codepoints:
            e = _encode_entity_map.get(cp)
            if e:
                res.append("&")
                res.append(e)
                if not e.endswith(";"):
                    res.append(";")
            else:
                res.append("&#x%s;" % (hex(cp)[2:]))
        return ("".join(res), exc.end)
    else:
        return xmlcharrefreplace_errors(exc)


register_error("htmlentityreplace", htmlentityreplace_errors)

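# Illustrative note (not part of the vendored source): with the handler
# registered, encoding falls back to character references instead of raising:
#
#     "caf\u00e9 \u03c6".encode("ascii", "htmlentityreplace")
#
# yields b'caf&eacute; &phi;', preferring named entities from the map built
# above and numeric references otherwise.
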
def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Serializes the input token stream using the specified treewalker

    :arg input: the token stream to serialize

    :arg tree: the treewalker to use

    :arg encoding: the encoding to use

    :arg serializer_opts: any options to pass to the
        :py:class:`html5lib.serializer.HTMLSerializer` that gets created

    :returns: the tree serialized as a string

    Example:

    >>> from html5lib.html5parser import parse
    >>> from html5lib.serializer import serialize
    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
    >>> serialize(token_stream, omit_optional_tags=False)
    '<html><head></head><body><p>Hi!</p></body></html>'

    """
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    s = HTMLSerializer(**serializer_opts)
    return s.render(walker(input), encoding)


class HTMLSerializer(object):

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer

        :arg inject_meta_charset: Whether or not to inject the meta charset.

            Defaults to ``True``.

        :arg quote_attr_values: Whether to quote attribute values that don't
            require quoting per legacy browser behavior (``"legacy"``), when
            required by the standard (``"spec"``), or always (``"always"``).

            Defaults to ``"legacy"``.

        :arg quote_char: Use given quote character for attribute quoting.

            Defaults to ``"`` which will use double quotes unless attribute
            value contains a double quote, in which case single quotes are
            used.

        :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
            values.

            Defaults to ``False``.

        :arg escape_rcdata: Whether to escape characters that need to be
            escaped within normal elements within rcdata elements such as
            style.

            Defaults to ``False``.

        :arg resolve_entities: Whether to resolve named character entities that
            appear in the source tree. The XML predefined entities &lt; &gt;
            &amp; &quot; &apos; are unaffected by this setting.

            Defaults to ``True``.

        :arg strip_whitespace: Whether to remove semantically meaningless
            whitespace. (This compresses all whitespace to a single space
            except within ``pre``.)

            Defaults to ``False``.

        :arg minimize_boolean_attributes: Shortens boolean attributes to give
            just the attribute value, for example::

              <input disabled="disabled">

            becomes::

              <input disabled>

            Defaults to ``True``.

        :arg use_trailing_solidus: Includes a close-tag slash at the end of the
            start tag of void elements (empty elements whose end tag is
            forbidden). E.g. ``<hr/>``.

            Defaults to ``False``.

        :arg space_before_trailing_solidus: Places a space immediately before
            the closing slash in a tag using a trailing solidus. E.g.
            ``<hr />``. Requires ``use_trailing_solidus=True``.

            Defaults to ``True``.

        :arg sanitize: Strip all unsafe or unknown constructs from output.
            See :py:class:`html5lib.filters.sanitizer.Filter`.

            Defaults to ``False``.

        :arg omit_optional_tags: Omit start/end tags that are optional.

            Defaults to ``True``.

        :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.

            Defaults to ``False``.

        """
        unexpected_args = frozenset(kwargs) - frozenset(self.options)
        if len(unexpected_args) > 0:
            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        assert(isinstance(string, text_type))
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        else:
            return string

    def encodeStrict(self, string):
        assert(isinstance(string, text_type))
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def serialize(self, treewalker, encoding=None):
        # pylint:disable=too-many-nested-blocks
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from .filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # Alphabetical attributes is here under the assumption that none of
        # the later filters add or change order of attributes; it needs to be
        # before the sanitizer so escaped elements come out correctly
        if self.alphabetical_attributes:
            from .filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from .filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from .filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from .filters.optionaltags import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            type = token["type"]
            if type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
                            self.serializeError("System identifier contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif type in ("Characters", "SpaceCharacters"):
                if type == "SpaceCharacters" or in_cdata:
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple()) and
                         k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values == "always" or len(v) == 0:
                            quote_attr = True
                        elif self.quote_attr_values == "spec":
                            quote_attr = _quoteAttributeSpec.search(v) is not None
                        elif self.quote_attr_values == "legacy":
                            quote_attr = _quoteAttributeLegacy.search(v) is not None
                        else:
                            raise ValueError("quote_attr_values must be one of: "
                                             "'always', 'spec', or 'legacy'")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif type == "Entity":
                name = token["name"]
                key = name + ";"
                if key not in entities:
                    self.serializeError("Entity %s not recognized" % name)
                if self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serializes the stream from the treewalker into a string

        :arg treewalker: the treewalker to serialize

        :arg encoding: the string encoding to use

        :returns: the serialized tree

        Example:

        >>> from html5lib import parse, getTreeWalker
        >>> from html5lib.serializer import HTMLSerializer
        >>> token_stream = parse('<html><body>Hi!</body></html>')
        >>> walker = getTreeWalker('etree')
        >>> serializer = HTMLSerializer(omit_optional_tags=False)
        >>> serializer.render(walker(token_stream))
        '<html><head></head><body>Hi!</body></html>'

        """
        if encoding:
            return b"".join(list(self.serialize(treewalker, encoding)))
        else:
            return "".join(list(self.serialize(treewalker)))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError


class SerializeError(Exception):
    """Error in serialized tree"""
    pass
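A short sketch tying together several of the options documented in `HTMLSerializer.__init__` above; the markup and option values are illustrative:

import html5lib
from html5lib.serializer import HTMLSerializer

dom = html5lib.parse("<html><body><p class=x>Hi<br></p></body></html>")
walker = html5lib.getTreeWalker("etree")

s = HTMLSerializer(omit_optional_tags=False,
                   quote_attr_values="always",
                   use_trailing_solidus=True)
# With these options the optional tags stay, every attribute value is
# quoted, and void elements get a trailing " /", roughly:
# '<html><head></head><body><p class="x">Hi<br /></p></body></html>'
print(s.render(walker(dom)))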
108  testing/web-platform/tests/tools/third_party/html5lib/html5lib/tests/conftest.py  (vendored) Normal file
@@ -0,0 +1,108 @@
from __future__ import print_function
import os.path
import sys

import pkg_resources
import pytest

from .tree_construction import TreeConstructionFile
from .tokenizer import TokenizerFile
from .sanitizer import SanitizerFile

_dir = os.path.abspath(os.path.dirname(__file__))
_root = os.path.join(_dir, "..", "..")
_testdata = os.path.join(_dir, "testdata")
_tree_construction = os.path.join(_testdata, "tree-construction")
_tokenizer = os.path.join(_testdata, "tokenizer")
_sanitizer_testdata = os.path.join(_dir, "sanitizer-testdata")


def fail_if_missing_pytest_expect():
    """Throws an exception halting pytest if pytest-expect isn't working"""
    try:
        from pytest_expect import expect  # noqa
    except ImportError:
        header = '*' * 78
        print(
            '\n' +
            header + '\n' +
            'ERROR: Either pytest-expect or its dependency u-msgpack-python is not\n' +
            'installed. Please install them both before running pytest.\n' +
            header + '\n',
            file=sys.stderr
        )
        raise


fail_if_missing_pytest_expect()


def pytest_configure(config):
    msgs = []

    if not os.path.exists(_testdata):
        msg = "testdata not available! "
        if os.path.exists(os.path.join(_root, ".git")):
            msg += ("Please run git submodule update --init --recursive " +
                    "and then run tests again.")
        else:
            msg += ("The testdata doesn't appear to be included with this package, " +
                    "so finding the right version will be hard. :(")
        msgs.append(msg)

    if config.option.update_xfail:
        # Check for optional requirements
        req_file = os.path.join(_root, "requirements-optional.txt")
        if os.path.exists(req_file):
            with open(req_file, "r") as fp:
                for line in fp:
                    if (line.strip() and
                            not (line.startswith("-r") or
                                 line.startswith("#"))):
                        if ";" in line:
                            spec, marker = line.strip().split(";", 1)
                        else:
                            spec, marker = line.strip(), None
                        req = pkg_resources.Requirement.parse(spec)
                        if marker and not pkg_resources.evaluate_marker(marker):
                            msgs.append("%s not available in this environment" % spec)
                        else:
                            try:
                                installed = pkg_resources.working_set.find(req)
                            except pkg_resources.VersionConflict:
                                msgs.append("Outdated version of %s installed, need %s" % (req.name, spec))
                            else:
                                if not installed:
                                    msgs.append("Need %s" % spec)

        # Check cElementTree
        import xml.etree.ElementTree as ElementTree

        try:
            import xml.etree.cElementTree as cElementTree
        except ImportError:
            msgs.append("cElementTree unable to be imported")
        else:
            if cElementTree.Element is ElementTree.Element:
                msgs.append("cElementTree is just an alias for ElementTree")

    if msgs:
        pytest.exit("\n".join(msgs))


def pytest_collect_file(path, parent):
    dir = os.path.abspath(path.dirname)
    dir_and_parents = set()
    while dir not in dir_and_parents:
        dir_and_parents.add(dir)
        dir = os.path.dirname(dir)

    if _tree_construction in dir_and_parents:
        if path.ext == ".dat":
            return TreeConstructionFile(path, parent)
    elif _tokenizer in dir_and_parents:
        if path.ext == ".test":
            return TokenizerFile(path, parent)
    elif _sanitizer_testdata in dir_and_parents:
        if path.ext == ".dat":
            return SanitizerFile(path, parent)
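Each record in the sanitizer test data below pairs an `input` with the `output` the sanitizer is expected to produce. A hypothetical driver for such records follows; the vendored suite instead collects them through `SanitizerFile` above and normalizes both sides before comparing, so this string-printing sketch is only an approximation:

import json

import html5lib
from html5lib.filters.sanitizer import Filter as SanitizerFilter
from html5lib.serializer import HTMLSerializer

with open("sanitizer-testdata/tests1.dat") as fp:  # path is illustrative
    cases = json.load(fp)

walker = html5lib.getTreeWalker("etree")
for case in cases:
    fragment = html5lib.parseFragment(case["input"])
    cleaned = HTMLSerializer().render(SanitizerFilter(walker(fragment)))
    print(case["name"], "->", cleaned)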
433  testing/web-platform/tests/tools/third_party/html5lib/html5lib/tests/sanitizer-testdata/tests1.dat  (vendored) Normal file
|
@ -0,0 +1,433 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"name": "IE_Comments",
|
||||||
|
"input": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->",
|
||||||
|
"output": ""
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "IE_Comments_2",
|
||||||
|
"input": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
|
||||||
|
"output": "<script>alert('XSS');</script>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "allow_colons_in_path_component",
|
||||||
|
"input": "<a href=\"./this:that\">foo</a>",
|
||||||
|
"output": "<a href='./this:that'>foo</a>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "background_attribute",
|
||||||
|
"input": "<div background=\"javascript:alert('XSS')\"></div>",
|
||||||
|
"output": "<div></div>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "bgsound",
|
||||||
|
"input": "<bgsound src=\"javascript:alert('XSS');\" />",
|
||||||
|
"output": "<bgsound src=\"javascript:alert('XSS');\"></bgsound>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "div_background_image_unicode_encoded",
|
||||||
|
"input": "<div style=\"background-image:\u00a5\u00a2\u006C\u0028'\u006a\u0061\u00a6\u0061\u00a3\u0063\u00a2\u0069\u00a0\u00a4\u003a\u0061\u006c\u0065\u00a2\u00a4\u0028.1027\u0058.1053\u0053\u0027\u0029'\u0029\">foo</div>",
|
||||||
|
"output": "<div style=''>foo</div>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "div_expression",
|
||||||
|
"input": "<div style=\"width: expression(alert('XSS'));\">foo</div>",
|
||||||
|
"output": "<div style=''>foo</div>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "double_open_angle_brackets",
|
||||||
|
"input": "<img src=http://ha.ckers.org/scriptlet.html <",
|
||||||
|
"output": ""
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "double_open_angle_brackets_2",
|
||||||
|
"input": "<script src=http://ha.ckers.org/scriptlet.html <",
|
||||||
|
"output": ""
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "grave_accents",
|
||||||
|
"input": "<img src=`javascript:alert('XSS')` />",
|
||||||
|
"output": "<img/>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "img_dynsrc_lowsrc",
|
||||||
|
"input": "<img dynsrc=\"javascript:alert('XSS')\" />",
|
||||||
|
"output": "<img/>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "img_vbscript",
|
||||||
|
"input": "<img src='vbscript:msgbox(\"XSS\")' />",
|
||||||
|
"output": "<img/>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "input_image",
|
||||||
|
"input": "<input type=\"image\" src=\"javascript:alert('XSS');\" />",
|
||||||
|
"output": "<input type='image'/>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "link_stylesheets",
|
||||||
|
"input": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\" />",
|
||||||
|
"output": "<link href=\"javascript:alert('XSS');\" rel=\"stylesheet\">"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "link_stylesheets_2",
|
||||||
|
"input": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\" />",
|
||||||
|
"output": "<link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\">"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "list_style_image",
|
||||||
|
"input": "<li style=\"list-style-image: url(javascript:alert('XSS'))\">foo</li>",
|
||||||
|
"output": "<li style=''>foo</li>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "no_closing_script_tags",
|
||||||
|
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
|
||||||
|
"output": "<script src=\"http://ha.ckers.org/xss.js?&lt;b\"></script>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "non_alpha_non_digit",
|
||||||
|
"input": "<script/XSS src=\"http://ha.ckers.org/xss.js\"></script>",
|
||||||
|
"output": "<script src=\"http://ha.ckers.org/xss.js\" xss=\"\"></script>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "non_alpha_non_digit_2",
|
||||||
|
"input": "<a onclick!\\#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>",
|
||||||
|
"output": "<a>foo</a>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "non_alpha_non_digit_3",
|
||||||
|
"input": "<img/src=\"http://ha.ckers.org/xss.js\"/>",
|
||||||
|
"output": "<img src='http://ha.ckers.org/xss.js'/>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "non_alpha_non_digit_II",
|
||||||
|
"input": "<a href!\\#$%&()*~+-_.,:;?@[/|]^`=alert('XSS')>foo</a>",
|
||||||
|
"output": "<a>foo</a>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "non_alpha_non_digit_III",
|
||||||
|
"input": "<a/href=\"javascript:alert('XSS');\">foo</a>",
|
||||||
|
"output": "<a>foo</a>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "platypus",
|
||||||
|
"input": "<a href=\"http://www.ragingplatypus.com/\" style=\"display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;\">never trust your upstream platypus</a>",
|
||||||
|
"output": "<a href='http://www.ragingplatypus.com/' style='display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;'>never trust your upstream platypus</a>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "protocol_resolution_in_script_tag",
|
||||||
|
"input": "<script src=//ha.ckers.org/.j></script>",
|
||||||
|
"output": "<script src=\"//ha.ckers.org/.j\"></script>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "should_allow_anchors",
|
||||||
|
"input": "<a href='foo' onclick='bar'><script>baz</script></a>",
|
||||||
|
"output": "<a href='foo'><script>baz</script></a>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "should_allow_image_alt_attribute",
|
||||||
|
"input": "<img alt='foo' onclick='bar' />",
|
||||||
|
"output": "<img alt='foo'/>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "should_allow_image_height_attribute",
|
||||||
|
"input": "<img height='foo' onclick='bar' />",
|
||||||
|
"output": "<img height='foo'/>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "should_allow_image_src_attribute",
|
||||||
|
"input": "<img src='foo' onclick='bar' />",
|
||||||
|
"output": "<img src='foo'/>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "should_allow_image_width_attribute",
|
||||||
|
"input": "<img width='foo' onclick='bar' />",
|
||||||
|
"output": "<img width='foo'/>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "should_handle_blank_text",
|
||||||
|
"input": "",
|
||||||
|
"output": ""
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "should_handle_malformed_image_tags",
|
||||||
|
"input": "<img \"\"\"><script>alert(\"XSS\")</script>\">",
|
||||||
|
"output": "<img/><script>alert(\"XSS\")</script>\">"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "should_handle_non_html",
|
||||||
|
"input": "abc",
|
||||||
|
"output": "abc"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "should_not_fall_for_ridiculous_hack",
|
||||||
|
"input": "<img\nsrc\n=\n\"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n\"\n />",
|
||||||
|
"output": "<img/>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "should_not_fall_for_xss_image_hack_0",
|
||||||
|
"input": "<img src=\"javascript:alert('XSS');\" />",
|
||||||
|
"output": "<img/>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "should_not_fall_for_xss_image_hack_1",
|
||||||
|
"input": "<img src=javascript:alert('XSS') />",
|
||||||
|
"output": "<img/>"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
    "name": "should_not_fall_for_xss_image_hack_10",
    "input": "<img src=\"jav&#x0A;ascript:alert('XSS');\" />",
    "output": "<img/>"
},
{
    "name": "should_not_fall_for_xss_image_hack_11",
    "input": "<img src=\"jav&#x0D;ascript:alert('XSS');\" />",
    "output": "<img/>"
},
{
    "name": "should_not_fall_for_xss_image_hack_12",
    "input": "<img src=\"  javascript:alert('XSS');\" />",
    "output": "<img/>"
},
{
    "name": "should_not_fall_for_xss_image_hack_13",
    "input": "<img src=\" javascript:alert('XSS');\" />",
    "output": "<img/>"
},
{
    "name": "should_not_fall_for_xss_image_hack_14",
    "input": "<img src=\" javascript:alert('XSS');\" />",
    "output": "<img/>"
},
{
    "name": "should_not_fall_for_xss_image_hack_2",
    "input": "<img src=\"JaVaScRiPt:alert('XSS')\" />",
    "output": "<img/>"
},
{
    "name": "should_not_fall_for_xss_image_hack_3",
    "input": "<img src='javascript:alert(&quot;XSS&quot;)' />",
    "output": "<img/>"
},
{
    "name": "should_not_fall_for_xss_image_hack_4",
    "input": "<img src='javascript:alert(String.fromCharCode(88,83,83))' />",
    "output": "<img/>"
},
{
    "name": "should_not_fall_for_xss_image_hack_5",
    "input": "<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />",
    "output": "<img/>"
},
{
    "name": "should_not_fall_for_xss_image_hack_6",
    "input": "<img src='&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041' />",
    "output": "<img/>"
},
{
    "name": "should_not_fall_for_xss_image_hack_7",
    "input": "<img src='&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29' />",
    "output": "<img/>"
},
{
    "name": "should_not_fall_for_xss_image_hack_8",
    "input": "<img src=\"jav\tascript:alert('XSS');\" />",
    "output": "<img/>"
},
{
    "name": "should_not_fall_for_xss_image_hack_9",
    "input": "<img src=\"jav&#x09;ascript:alert('XSS');\" />",
    "output": "<img/>"
},
{
    "name": "should_sanitize_half_open_scripts",
    "input": "<img src=\"javascript:alert('XSS')\"",
    "output": ""
},
{
    "name": "should_sanitize_invalid_script_tag",
    "input": "<script/XSS SRC=\"http://ha.ckers.org/xss.js\"></script>",
    "output": "<script src=\"http://ha.ckers.org/xss.js\" xss=\"\"></script>"
},
{
    "name": "should_sanitize_script_tag_with_multiple_open_brackets",
    "input": "<<script>alert(\"XSS\");//<</script>",
    "output": "<<script>alert(\"XSS\");//<</script>"
},
{
    "name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
    "input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
    "output": ""
},
{
    "name": "should_sanitize_tag_broken_up_by_null",
    "input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
    "output": "<scr\ufffdipt>alert(\"XSS\")</scr\ufffdipt>"
},
{
    "name": "should_sanitize_unclosed_script",
    "input": "<script src=http://ha.ckers.org/xss.js?<b>",
    "output": "<script src=\"http://ha.ckers.org/xss.js?&lt;b\"></script>"
},
{
    "name": "should_strip_href_attribute_in_a_with_bad_protocols",
    "input": "<a href=\"javascript:XSS\" title=\"1\">boo</a>",
    "output": "<a title='1'>boo</a>"
},
{
    "name": "should_strip_href_attribute_in_a_with_bad_protocols_and_whitespace",
    "input": "<a href=\" javascript:XSS\" title=\"1\">boo</a>",
    "output": "<a title='1'>boo</a>"
},
{
    "name": "should_strip_src_attribute_in_img_with_bad_protocols",
    "input": "<img src=\"javascript:XSS\" title=\"1\">boo</img>",
    "output": "<img title='1'/>boo"
},
{
    "name": "should_strip_src_attribute_in_img_with_bad_protocols_and_whitespace",
    "input": "<img src=\" javascript:XSS\" title=\"1\">boo</img>",
    "output": "<img title='1'/>boo"
},
{
    "name": "xml_base",
    "input": "<div xml:base=\"javascript:alert('XSS');//\">foo</div>",
    "output": "<div>foo</div>"
},
{
    "name": "xul",
    "input": "<p style=\"-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')\">fubar</p>",
    "output": "<p style=''>fubar</p>"
},
{
    "name": "quotes_in_attributes",
    "input": "<img src='foo' title='\"foo\" bar' />",
    "output": "<img src='foo' title='\"foo\" bar'/>"
},
{
    "name": "uri_refs_in_svg_attributes",
    "input": "<svg><rect fill='url(#foo)' />",
    "output": "<svg><rect fill='url(#foo)'></rect></svg>"
},
{
    "name": "absolute_uri_refs_in_svg_attributes",
    "input": "<svg><rect fill='url(http://bad.com/) #fff' />",
    "output": "<svg><rect fill=' #fff'></rect></svg>"
},
{
    "name": "uri_ref_with_space_in svg_attribute",
    "input": "<svg><rect fill='url(\n#foo)' />",
    "output": "<svg><rect fill='url(\n#foo)'></rect></svg>"
},
{
    "name": "absolute_uri_ref_with_space_in svg_attribute",
    "input": "<svg><rect fill=\"url(\nhttp://bad.com/)\" />",
    "output": "<svg><rect fill=' '></rect></svg>"
},
{
    "name": "allow_html5_image_tag",
    "input": "<image src='foo' />",
    "output": "<img src='foo'/>"
},
{
    "name": "style_attr_end_with_nothing",
    "input": "<div style=\"color: blue\" />",
    "output": "<div style='color: blue;'></div>"
},
{
    "name": "style_attr_end_with_space",
    "input": "<div style=\"color: blue \" />",
    "output": "<div style='color: blue ;'></div>"
},
{
    "name": "style_attr_end_with_semicolon",
    "input": "<div style=\"color: blue;\" />",
    "output": "<div style='color: blue;'></div>"
},
{
    "name": "style_attr_end_with_semicolon_space",
    "input": "<div style=\"color: blue; \" />",
    "output": "<div style='color: blue;'></div>"
},
{
    "name": "attributes_with_embedded_quotes",
    "input": "<img src=doesntexist.jpg\"'onerror=\"alert(1) />",
    "output": "<img src='doesntexist.jpg\"'onerror=\"alert(1)'/>"
},
{
    "name": "attributes_with_embedded_quotes_II",
    "input": "<img src=notthere.jpg\"\"onerror=\"alert(2) />",
    "output": "<img src='notthere.jpg\"\"onerror=\"alert(2)'/>"
}
]
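
For orientation, a minimal sketch of how fixtures like the ones above could be driven through html5lib's sanitizing serializer. The fixture file name and the pass/fail comparison are assumptions for illustration, not taken from this diff; html5lib's own test harness additionally normalizes attribute quoting and ordering before comparing outputs.

import json

import html5lib

# Hypothetical path to a fixture file shaped like the JSON array above.
with open("sanitizer-testdata.json") as f:
    cases = json.load(f)

for case in cases:
    # Parse the attack string as a fragment, then re-serialize with the
    # sanitizer enabled; sanitize=True routes the token stream through
    # html5lib.filters.sanitizer.Filter.
    fragment = html5lib.parseFragment(case["input"])
    result = html5lib.serialize(fragment, sanitize=True)
    # Naive comparison: the expected outputs above use single-quoted
    # attributes, so a real harness would normalize quoting first.
    print(case["name"], "ok" if result == case["output"] else "differs")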