Bug 565678 - Adapt quickparse.py (ISPDB statistics generator) to DNS MX. r=gozer,benb

git-svn-id: http://svn.mozilla.org/mozillamessaging.com/sites/ispdb.mozillamessaging.com/trunk@73346 4eb1ac78-321c-0410-a911-ec516a8615a5
This commit is contained in:
Blake Winton 2010-08-30 17:19:19 +00:00 коммит произвёл Andrei Hajdukewycz
Родитель 2faf917f88
Коммит 0240d7b766
2 изменённых файлов: 182 добавлений и 33 удалений

90
tools/etld.py Normal file
Просмотреть файл

@ -0,0 +1,90 @@
#!/usr/bin/python
# Copyright (c) 2009 Michael Still
# Released under the terms of the GNU GPL v2
# Mozilla publishes a rule file which may be used to calculate effective TLDs
# at:
#
# http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/src/
# effective_tld_names.dat?raw=1
#
# Use that file to take a domain name and return a (domain, etld) tuple.
# Documentation for the rule file format is at:
#
# https://wiki.mozilla.org/Gecko:Effective_TLD_Service
import re
import sys
import time
class EtldException(Exception):
    """Raised when a hostname cannot be split into domain and effective TLD."""
class etld(object):
    """Helper to determine the effective TLD portion of a domain name.

    Rules are read from a rule file and compiled into regular expressions
    that are matched against the *reversed* hostname, so that every pattern
    is anchored at the TLD end of the name.
    """

    def __init__(self, datafile='effective_tld_names.dat'):
        """Load the data file ready for lookups.

        datafile -- path of the rule file. Blank lines and lines starting
                    with '//' are skipped.

        Any error raised by open() propagates to the caller.
        """
        # Maps the last label of each rule (e.g. 'uk') to the list of
        # compiled patterns for the rules ending in that label.
        self.rules = {}
        with open(datafile) as rulefile:
            for line in rulefile:
                line = line.rstrip()
                if not line or line.startswith('//'):
                    continue
                tld = line.split('.')[-1]
                self.rules.setdefault(tld, []).append(
                    re.compile(self.regexpize(line)))

    def regexpize(self, line):
        """Convert a rule to regexp syntax.

        The rule text is reversed (it is matched against a reversed
        hostname), '.' is escaped, '*' becomes a run of non-dot characters
        and '!' is removed. In the resulting pattern group 1 is the reversed
        effective TLD and group 2 the reversed remainder of the hostname.
        """
        # NOTE: removing '!' means an exception rule is matched exactly like
        # an ordinary rule; it is not given any special precedence.
        reversed_rule = (line[::-1]
                         .replace('.', '\\.')
                         .replace('*', '[^\\.]*')
                         .replace('!', ''))
        return r'^(%s)\.(.*)$' % reversed_rule

    def parse(self, hostname):
        """Parse a hostname into domain and etld portions.

        Returns a (domain, etld) tuple, e.g. ('www.bbc', 'co.uk') for
        'www.bbc.co.uk'. When several rules match, the longest effective
        TLD wins. The hostname is lower-cased before matching.

        Raises EtldException if no rule matches.
        """
        hostname = hostname.lower()
        tld = hostname.split('.')[-1]
        reversed_name = hostname[::-1]
        domain = ''
        suffix = ''
        for rule in self.rules.get(tld, ()):
            m = rule.match(reversed_name)
            # Compare lengths, not the strings themselves: every match is a
            # prefix of the same reversed hostname, so the longer match is
            # the more specific rule (e.g. 'co.uk' beats 'uk').
            if m and len(m.group(1)) > len(suffix):
                domain = m.group(2)[::-1]
                suffix = m.group(1)[::-1]
        if not suffix:
            raise EtldException('Parse failed')
        return (domain, suffix)
if __name__ == '__main__':
    # Command-line self-test: parse every hostname (one per line) found in
    # the file named by the first argument, print each result or the error
    # it produced, then report the elapsed time.
    if len(sys.argv) < 2:
        sys.exit('usage: %s <file-with-one-hostname-per-line>' % sys.argv[0])
    e = etld()
    # The with-block guarantees the input file is closed even if the run is
    # interrupted part-way through.
    with open(sys.argv[1]) as f:
        start_time = time.time()
        for l in f:
            try:
                l = l.rstrip()
                print('%s -> %s' % (l, e.parse(l)))
            except Exception as ex:
                # Best effort: report the failure and carry on with the
                # next hostname.
                print(ex)
        print('Took %f seconds' % (time.time() - start_time))

Просмотреть файл

@ -1,5 +1,9 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from datetime import date, timedelta
import DNS
import etld # <http://www.stillhq.com/python/etld/etld.py>
from optparse import OptionParser
import os
import pickle
@ -9,33 +13,32 @@ import sys
# Constants.
# Domains singled out when classifying 404 results: the code visible in this
# file only tests membership in this dict and never reads the values.
# NOTE(review): the integer values look like tracker/bug IDs - confirm.
ASSIGNED_DOMAINS = {
"mail.ru": 493758,
"yandex.ru": 493758,
"rambler.ru": 493758,
}
# Utility functions.
def readPrevious(pickleName):
    """Read the previous data from the specified pickle.

    Returns a (prevHits, prevMisses, mxs) tuple. If |pickleName| does not
    exist the result is (None, None, {}). If the file holds only the first
    two objects (no MX cache), |mxs| is an empty dict.
    """
    prevHits = None
    prevMisses = None
    mxs = {}
    if os.path.exists(pickleName):
        # NOTE(security): unpickling can execute arbitrary code; only point
        # this at cache files written by this script.
        # Binary mode is required: writeNext() dumps with a binary protocol.
        with open(pickleName, "rb") as data:
            prevHits = pickle.load(data)
            prevMisses = pickle.load(data)
            try:
                mxs = pickle.load(data)
            except EOFError:
                # No third object in the file: start with an empty MX cache.
                pass
    return prevHits, prevMisses, mxs
def writeNext(pickleName, hits, misses, mxs):
    """Write the current data to the specified pickle.

    |hits|, |misses| and |mxs| are dumped, in that order, as three
    consecutive pickles; readPrevious() loads them back in the same order.
    """
    # Protocol -1 selects a binary pickle format, so the file has to be
    # opened in binary mode. The with-block closes it even if a dump fails.
    with open(pickleName, "wb") as out:
        pickle.dump(hits, out, -1)
        pickle.dump(misses, out, -1)
        pickle.dump(mxs, out, -1)
def gatherData(files):
def gatherData(files, mxs):
"""Gather all the data."""
domains = []
domain2count = {}
@ -59,14 +62,26 @@ def gatherData(files):
domain2count[(domain,code)] = 1
for domain in domains:
domain["count"] = domain2count[(domain["domain"],domain["code"])]
ip_histogram = {}
# So, now we've got all the lines, but some of the failures will actually
# be hits on the MX, so let's try to remove those.
domainsDict = dictify(domains)
mx_hits = [];
for domain in domains:
if domain["code"] == "404":
# We've got a missing domain, so let's check for the MX record.
mx = getMX(mxs, domain["domain"])
if mx and domainsDict.has_key(mx):
mx_hits.append(domain["count"])
domains.remove(domain)
ip_histogram = {}
for ip, count in countsperIP.items():
ip_histogram[count] = ip_histogram.setdefault(count, 0) + 1
counts = ip_histogram.keys()
counts.sort()
return domains, counts, ip_histogram
return domains, counts, mx_hits, ip_histogram
def dictify(data):
"""Change a list of domains, counts, and codes into a dict of the same."""
@ -100,6 +115,35 @@ def calculateDiffs(prevData, data):
retval.sort(rank_by_count)
return retval
# Module-level eTLD parser used by getSLD(). It is constructed once, when this
# module is loaded, with the etld class's default arguments.
etldService = etld.etld()
def getSLD(domain):
    """Reduce a hostname to its "second level domain".

    For example "5.4.bbc.co.uk" becomes "bbc.co.uk". When the eTLD service
    cannot parse the name, |domain| is returned unchanged.
    """
    try:
        parsed = etldService.parse(domain)  # e.g. ("5.4.bbc", "co.uk")
    except etld.EtldException:
        return domain
    # Keep only the label immediately to the left of the effective TLD.
    labels = parsed[0].rsplit(".", 1)
    return labels[-1] + "." + parsed[1]
def getMX(mxs, name):
    """ You pass in domain |name| and it returns the hostname of the MX server.
    It either uses the cache |mxs| or does a lookup via DNS over the Internet
    (and populates the cache).

    Returns None when the lookup fails or yields no MX record; that None is
    cached as well, so the lookup is not repeated for the same name."""
    if name in mxs:
        return mxs[name]
    try:
        records = DNS.mxlookup(name.encode("utf-8"))
    except (DNS.DNSError, UnicodeError):
        # Best effort: an unresolvable or unencodable name simply has no MX.
        records = []
    best = records[0] if records else None
    # NOTE(review): pydns documents mxlookup() as returning a sorted list of
    # (preference, exchanger) pairs - confirm. Unwrap such a pair so that the
    # cache holds the hostname this function promises; a plain string entry
    # is stored unchanged.
    if isinstance(best, (tuple, list)):
        best = best[-1]
    mxs[name] = best
    return best
class Usage(Exception):
def __init__(self, msg):
@ -112,48 +156,63 @@ def main(argv=None):
if argv is None:
argv = sys.argv
parser = OptionParser()
usage = """%prog [options] logfile [...]
logfile Apache logfile"""
parser = OptionParser(usage=usage)
parser.add_option("-p", "--previous", dest="previous",
default="./quickparse.pickle",
help="Where to get the previous data. "
"%default by default.")
default=None, #set below
help="Where to get the cache data of the previous run"
"of this script esp. DNS MX lookups, which can"
"take hours. <filename-1>.pickle by default.")
parser.add_option("-n", "--next", dest="next",
default=None,
help="Where to write the new data. "
"<filename>.pickle by default.")
(options, args) = parser.parse_args()
default=None, # set below
help="Where to write the new cache data of this run of"
"this script. <filename>.pickle by default.")
(options, logfiles) = parser.parse_args()
if len(logfiles) < 1:
parser.print_usage()
exit(1)
if not options.next:
options.next = os.path.splitext(args[0])[0]+".pickle"
options.next = os.path.splitext(logfiles[0])[0]+".pickle"
if not options.previous:
try:
name = os.path.splitext(logfiles[0])[0]
d = date(int(name[0:4]), int(name[4:6]), int(name[6:8]))
d -= timedelta(days=1)
name = d.strftime("%Y%m%d")
options.previous = name+".pickle"
except:
options.previous = options.next
prevHits, prevMisses = readPrevious(options.previous)
prevHits, prevMisses, mxs = readPrevious(options.previous)
domains, counts, ip_histogram = gatherData(args)
domains, counts, mx_hits, ip_histogram = gatherData(logfiles, mxs)
print "# of requests per single IP:"
for c in counts[:9]:
print c, ip_histogram[c]
print "%4d %d" %(c, ip_histogram[c])
if len(counts) > 9:
print counts[9], "and more:", sum([ip_histogram[i] for i in counts[9:]])
print "%3d+" %(counts[9],), sum([ip_histogram[i] for i in counts[9:]])
hits = dictify(d for d in domains if d["code"] in ("200","304"))
misses = dictify(d for d in domains if d["code"] == "404"
and d["domain"] not in ASSIGNED_DOMAINS)
pending = dictify(d for d in domains if d["code"] == "404"
and d["domain"] in ASSIGNED_DOMAINS)
misses = dictify(d for d in domains if d["code"] == "404")
weirdos = sorted(d for d in domains if d["code"] not in ("200","304","404"))
miss_total = sum(misses.values())
pending_total = sum(pending.values())
weirdo_total = sum(x["count"] for x in weirdos)
hit_total = sum(hits.values())
total_queries = miss_total + hit_total + pending_total + weirdo_total
mx_total = sum(mx_hits)
total_queries = miss_total + hit_total + weirdo_total
if total_queries != sum([d["count"] for d in domains]):
print "Error: total_queries (%d) != sum of domain counts (%d)." % (
total_queries, sum([d["count"] for d in domains]))
print "HITS: %d domains, accounting for %d successes, or %3.1f%% success rate" % (len(hits), hit_total, 100.*hit_total/total_queries)
print " MX: %d domains, accounting for %d hits." % (len(mx_hits), mx_total)
print "MISSES: %d domains, accounting for %d failures, or %3.1f%% fail rate" % (len(misses), miss_total, 100.*miss_total/total_queries)
print "PENDING: %d domains, accounting for %d failures, or %3.1f%% fail rate" % (len(pending), pending_total, 100.*pending_total/total_queries)
print "WEIRDOS: %d domains, accounting for %d oddities, or %3.1f%% strangeness rate" % (len(weirdos), weirdo_total, 100.*weirdo_total/total_queries)
print "\n".join(" %(domain)s (%(count)s hits, returned %(code)s)" % x for x in weirdos)
print
@ -193,7 +252,7 @@ def main(argv=None):
printDetails(prevMisses[:10], total_queries, "+")
print
writeNext(options.next, hits, misses)
writeNext(options.next, hits, misses, mxs)
return 0
if __name__ == "__main__":