2012-05-21 15:12:37 +04:00
|
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
2008-02-15 01:57:20 +03:00
|
|
|
|
|
|
|
import codecs
|
|
|
|
import encodings.idna
|
2017-07-18 02:10:18 +03:00
|
|
|
import imp
|
|
|
|
import os
|
2008-02-15 01:57:20 +03:00
|
|
|
import re
|
|
|
|
import sys
|
2018-08-01 01:20:18 +03:00
|
|
|
from make_dafsa import words_to_cxx
|
2008-02-15 01:57:20 +03:00
|
|
|
|
|
|
|
"""
|
|
|
|
Processes a file containing effective TLD data. See the following URL for a
|
|
|
|
description of effective TLDs and of the file format that this script
|
|
|
|
processes (although for the latter you're better off just reading this file's
|
|
|
|
short source code).
|
|
|
|
|
|
|
|
http://wiki.mozilla.org/Gecko:Effective_TLD_Service
|
|
|
|
"""
|
|
|
|
|
|
|
|
def getEffectiveTLDs(path):
    """
    Generates an EffectiveTLDEntry for each rule in the effective TLD file at
    path, which is read as UTF-8.

    Lines that start with "//" and lines that contain no "." are skipped.
    For every other line, only the text before the first space, tab or
    newline is used as the rule.

    Raises AssertionError if two rules normalize to the same domain.
    """
    domains = set()

    # The with-block closes the file when the generator finishes, fails, or
    # is discarded before being exhausted.
    with codecs.open(path, "r", "UTF-8") as tld_file:
        for line in tld_file:
            # line always contains a line terminator unless the file is empty
            if len(line) == 0:
                # A plain return ends the generator; raising StopIteration
                # here would surface as RuntimeError under PEP 479.
                return
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or "." not in line:
                continue
            line = re.split(r"[ \t\n]", line, 1)[0]
            entry = EffectiveTLDEntry(line)
            domain = entry.domain()
            # Raised explicitly (rather than via an assert statement) so the
            # duplicate check still runs when Python is started with -O.
            if domain in domains:
                raise AssertionError(
                    "repeating domain %s makes no sense" % domain)
            domains.add(domain)
            yield entry
|
2008-02-15 01:57:20 +03:00
|
|
|
|
|
|
|
def _normalizeHostname(domain):
    """
    Normalizes the given domain, component by component. ASCII components are
    lowercased, while non-ASCII components are processed using the ToASCII
    algorithm.

    domain is a text string of labels separated by "."; the return value is
    the normalized labels joined by "." again.
    """
    def convertLabel(label):
        if _isASCII(label):
            return label.lower()
        # ToASCII hands back a byte string. Decode it so that every label is
        # text; on Python 3, joining str and bytes labels raises TypeError.
        return encodings.idna.ToASCII(label).decode("ascii")

    return ".".join(map(convertLabel, domain.split(".")))
|
|
|
|
|
|
|
|
def _isASCII(s):
|
|
|
|
"True if s consists entirely of ASCII characters, false otherwise."
|
|
|
|
for c in s:
|
|
|
|
if ord(c) > 127:
|
|
|
|
return False
|
|
|
|
return True
|
|
|
|
|
|
|
|
class EffectiveTLDEntry:
    """
    One rule from an effective-TLD name file: a normalized domain plus flags
    recording whether the rule was written as an exception ("!") or as a
    wildcard ("*.").
    """

    # Class-level defaults; __init__ overrides at most one of them per entry.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Builds an entry from a single rule line. The line must already have
        had its line ending removed.
        """
        hostname = line
        if line.startswith("!"):
            # "!domain" marks an exception rule.
            self._exception = True
            hostname = line[1:]
        elif line.startswith("*."):
            # "*.domain" marks a wildcard rule.
            self._wild = True
            hostname = line[2:]
        self._domain = _normalizeHostname(hostname)

    def domain(self):
        "The normalized domain of this entry, without any rule prefix."
        return self._domain

    def exception(self):
        "True if this entry's domain does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry stands for a whole class of effective TLDs."
        return self._wild
|
|
|
|
|
|
|
|
|
|
|
|
#################
|
|
|
|
# DO EVERYTHING #
|
|
|
|
#################
|
|
|
|
|
2014-12-16 23:15:14 +03:00
|
|
|
def main(output, effective_tld_filename):
    """
    Parses the effective TLD file named by effective_tld_filename, then
    writes to output a C++ array of a binary representation of a DAFSA
    representing that eTLD file.
    """

    def typeEnum(etld):
        """
        Translates an entry's flags into the DAFSA's enum value:
        1 for an exception, 2 for a wildcard, 0 otherwise.
        """
        if etld.exception():
            return 1
        if etld.wild():
            return 2
        return 0

    def dafsa_words():
        """
        Produces the "<domain_name><enum_value>" lines that make_dafsa
        expects, one per entry of the eTLD file.
        """
        for entry in getEffectiveTLDs(effective_tld_filename):
            name = entry.domain()
            kind = typeEnum(entry)
            yield "%s%d" % (name, kind)

    cxx_source = words_to_cxx(dafsa_words())
    output.write(cxx_source)
|
2015-04-01 09:50:00 +03:00
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the effective
    # TLD file to parse; the generated output is written to stdout.
    main(sys.stdout, sys.argv[1])
|