Mirror of https://github.com/mozilla/ispdb.git
Bug 565678 - Adapt quickparse.py (ISPDB statistics generator) to DNS MX. r=gozer,benb
git-svn-id: http://svn.mozilla.org/mozillamessaging.com/sites/ispdb.mozillamessaging.com/trunk@73346 4eb1ac78-321c-0410-a911-ec516a8615a5
Parent: 2faf917f88
Commit: 0240d7b766
--- /dev/null
+++ b/etld.py
@@ -0,0 +1,90 @@
#!/usr/bin/python

# Copyright (c) 2009 Michael Still
# Released under the terms of the GNU GPL v2

# Mozilla publishes a rule file which may be used to calculate effective TLDs
# at:
#
# http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/src/
# effective_tld_names.dat?raw=1
#
# Use that file to take a domain name and return a (domain, etld) tuple.
# Documentation for the rule file format is at:
#
# https://wiki.mozilla.org/Gecko:Effective_TLD_Service
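#
# For illustration, a hypothetical excerpt of that rule file (plain rules,
# '*' wildcards, '!' exceptions; '//' lines are comments):
#
#   // effective TLD rules
#   com
#   *.uk
#   !metro.tokyo.jp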

import re
import sys
import time


class EtldException(Exception):
    pass


class etld(object):
    """Helper to determine the effective TLD portion of a domain name."""

    def __init__(self, datafile='effective_tld_names.dat'):
        """Load the data file ready for lookups."""

        self.rules = {}

        file = open(datafile)
        line = file.readline()
        while line:
            line = line.rstrip()
            if line and not line.startswith('//'):
                tld = line.split('.')[-1]
                self.rules.setdefault(tld, [])
                self.rules[tld].append(re.compile(self.regexpize(line)))

            line = file.readline()
        file.close()

    def regexpize(self, line):
        """Convert a rule to regexp syntax."""

        line = line[::-1].replace('.', '\\.').replace('*', '[^\\.]*').replace('!', '')
        return '^(%s)\.(.*)$' % line
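
    # For example (illustrative): the rule 'co.uk' reverses to 'ku.oc' and
    # yields the pattern '^(ku\.oc)\.(.*)$'; the wildcard rule '*.uk'
    # becomes '^(ku\.[^\.]*)\.(.*)$'.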

    def parse(self, hostname):
        """Parse a hostname into domain and etld portions."""

        hostname = hostname.lower()
        tld = hostname.split('.')[-1]
        hostname = hostname[::-1]
        domain = ''
        etld = ''

        if tld in self.rules:
            for rule in self.rules[tld]:
                m = rule.match(hostname)
                if m and m.group(1) > etld:
                    domain = m.group(2)[::-1]
                    etld = m.group(1)[::-1]

        if not etld:
            raise EtldException('Parse failed')

        return (domain, etld)
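
    # Worked example (assuming the data file contains a 'co.uk' rule):
    # parse('www.bbc.co.uk') reverses the name to 'ku.oc.cbb.www', matches
    # '^(ku\.oc)\.(.*)$' and returns ('www.bbc', 'co.uk').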


if __name__ == '__main__':
    e = etld()

    f = open(sys.argv[1])
    l = f.readline()
    start_time = time.time()

    while l:
        try:
            l = l.rstrip()
            print '%s -> %s' % (l, e.parse(l))
        except Exception, ex:
            print ex

        l = f.readline()

    print 'Took %f seconds' % (time.time() - start_time)
    f.close()

--- a/quickparse.py
+++ b/quickparse.py
@@ -1,5 +1,9 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+from datetime import date, timedelta
+import DNS
+import etld # <http://www.stillhq.com/python/etld/etld.py>
 from optparse import OptionParser
 import os
+import pickle

@@ -9,33 +13,32 @@ import sys
 
 # Constants.
 
-ASSIGNED_DOMAINS = {
-    "mail.ru": 493758,
-    "yandex.ru": 493758,
-    "rambler.ru": 493758,
-}
-
 
 # Utility functions.
 
 def readPrevious(pickleName):
     """Read the previous data from the specified pickle."""
     prevHits = None
     prevMisses = None
+    mxs = {}
     if os.path.exists(pickleName):
         data = open(pickleName)
         prevHits = pickle.load(data)
         prevMisses = pickle.load(data)
-    return prevHits, prevMisses
+        try:
+            mxs = pickle.load(data)
+        except EOFError:
+            pass
+    return prevHits, prevMisses, mxs
 
-def writeNext(pickleName, hits, misses):
+def writeNext(pickleName, hits, misses, mxs):
     """Write the current data to the specified pickle."""
     out = open(pickleName, "w")
     pickle.dump(hits, out, -1)
     pickle.dump(misses, out, -1)
+    pickle.dump(mxs, out, -1)
     out.close()
 
-def gatherData(files):
+def gatherData(files, mxs):
     """Gather all the data."""
     domains = []
     domain2count = {}
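
(A note on the cache format implied above: readPrevious() must load records in
exactly the order writeNext() dumps them, first hits, then misses, then the
new MX map. Pickles written before this change lack the third record, which is
why the EOFError is silently swallowed.)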

@@ -59,14 +62,26 @@ def gatherData(files):
             domain2count[(domain,code)] = 1
     for domain in domains:
         domain["count"] = domain2count[(domain["domain"],domain["code"])]
-    ip_histogram = {}
 
+    # So, now we've got all the lines, but some of the failures will actually
+    # be hits on the MX, so let's try to remove those.
+    domainsDict = dictify(domains)
+    mx_hits = [];
+    for domain in domains:
+        if domain["code"] == "404":
+            # We've got a missing domain, so let's check for the MX record.
+            mx = getMX(mxs, domain["domain"])
+            if mx and domainsDict.has_key(mx):
+                mx_hits.append(domain["count"])
+                domains.remove(domain)
+
+    ip_histogram = {}
     for ip, count in countsperIP.items():
         ip_histogram[count] = ip_histogram.setdefault(count, 0) + 1
 
     counts = ip_histogram.keys()
     counts.sort()
-    return domains, counts, ip_histogram
+    return domains, counts, mx_hits, ip_histogram
 
 def dictify(data):
     """Change a list of domains, counts, and codes into a dict of the same."""

@@ -100,6 +115,35 @@ def calculateDiffs(prevData, data):
     retval.sort(rank_by_count)
     return retval
 
+etldService = etld.etld()
+
+def getSLD(domain):
+    """Get the "second level domain", e.g. "mozilla.org" or "bbc.co.uk" """
+    try:
+        sp = etldService.parse(domain) # returns ("5.4.bbc", "co.uk")
+        sld = sp[0].rsplit(".", 1)[-1]
+        tld = sp[1]
+        return sld + "." + tld
+    except etld.EtldException:
+        return domain
+
+
+def getMX(mxs, name):
+    """ You pass in domain |name| and it returns the hostname of the MX server.
+    It either uses the cache |mxs| or does a lookup via DNS over the Internet
+    (and populates the cache)."""
+    if name not in mxs:
+        possible_mxs = []
+        try:
+            possible_mxs = DNS.mxlookup(name.encode("utf-8"))
+        except DNS.DNSError:
+            pass
+        except UnicodeError:
+            pass
+        if len(possible_mxs) < 1:
+            possible_mxs = [None]
+        mxs[name] = possible_mxs[0]
+    return mxs[name]
+
 class Usage(Exception):
     def __init__(self, msg):
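
For reference, how the two new helpers behave (illustrative; getMX's result
depends on live DNS and on what DNS.mxlookup() returns for the name):

    print getSLD("imap.bbc.co.uk")   # -> "bbc.co.uk"
    mxs = {}
    getMX(mxs, "example.com")        # one DNS lookup; result cached in mxs
    getMX(mxs, "example.com")        # answered from the cache, no network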

@@ -112,48 +156,63 @@ def main(argv=None):
     if argv is None:
         argv = sys.argv
 
-    parser = OptionParser()
+    usage = """%prog [options] logfile [...]
+logfile      Apache logfile"""
+    parser = OptionParser(usage=usage)
     parser.add_option("-p", "--previous", dest="previous",
-                      default="./quickparse.pickle",
-                      help="Where to get the previous data. "
-                           "%default by default.")
+                      default=None, # set below
+                      help="Where to get the cache data of the previous run "
+                           "of this script, esp. DNS MX lookups, which can "
+                           "take hours. <filename-1>.pickle by default.")
     parser.add_option("-n", "--next", dest="next",
-                      default=None,
-                      help="Where to write the new data. "
-                           "<filename>.pickle by default.")
-    (options, args) = parser.parse_args()
+                      default=None, # set below
+                      help="Where to write the new cache data of this run "
+                           "of this script. <filename>.pickle by default.")
+    (options, logfiles) = parser.parse_args()
 
+    if len(logfiles) < 1:
+        parser.print_usage()
+        exit(1)
+
     if not options.next:
-        options.next = os.path.splitext(args[0])[0]+".pickle"
+        options.next = os.path.splitext(logfiles[0])[0]+".pickle"
+    if not options.previous:
+        try:
+            name = os.path.splitext(logfiles[0])[0]
+            d = date(int(name[0:4]), int(name[4:6]), int(name[6:8]))
+            d -= timedelta(days=1)
+            name = d.strftime("%Y%m%d")
+            options.previous = name+".pickle"
+        except:
+            options.previous = options.next
 
-    prevHits, prevMisses = readPrevious(options.previous)
+    prevHits, prevMisses, mxs = readPrevious(options.previous)
 
-    domains, counts, ip_histogram = gatherData(args)
+    domains, counts, mx_hits, ip_histogram = gatherData(logfiles, mxs)
 
     print "# of requests per single IP:"
     for c in counts[:9]:
-        print c, ip_histogram[c]
+        print "%4d %d" %(c, ip_histogram[c])
     if len(counts) > 9:
-        print counts[9], "and more:", sum([ip_histogram[i] for i in counts[9:]])
+        print "%3d+" %(counts[9],), sum([ip_histogram[i] for i in counts[9:]])
 
     hits = dictify(d for d in domains if d["code"] in ("200","304"))
-    misses = dictify(d for d in domains if d["code"] == "404"
-                     and d["domain"] not in ASSIGNED_DOMAINS)
-    pending = dictify(d for d in domains if d["code"] == "404"
-                      and d["domain"] in ASSIGNED_DOMAINS)
+    misses = dictify(d for d in domains if d["code"] == "404")
     weirdos = sorted(d for d in domains if d["code"] not in ("200","304","404"))
 
     miss_total = sum(misses.values())
-    pending_total = sum(pending.values())
     weirdo_total = sum(x["count"] for x in weirdos)
     hit_total = sum(hits.values())
-    total_queries = miss_total + hit_total + pending_total + weirdo_total
+    mx_total = sum(mx_hits)
+    total_queries = miss_total + hit_total + weirdo_total
+    if total_queries != sum([d["count"] for d in domains]):
+        print "Error: total_queries (%d) != sum of domain counts (%d)." % (
+            total_queries, sum([d["count"] for d in domains]))
 
 
     print "HITS: %d domains, accounting for %d successes, or %3.1f%% success rate" % (len(hits), hit_total, 100.*hit_total/total_queries)
+    print " MX: %d domains, accounting for %d hits." % (len(mx_hits), mx_total)
     print "MISSES: %d domains, accounting for %d failures, or %3.1f%% fail rate" % (len(misses), miss_total, 100.*miss_total/total_queries)
-    print "PENDING: %d domains, accounting for %d failures, or %3.1f%% fail rate" % (len(pending), pending_total, 100.*pending_total/total_queries)
     print "WEIRDOS: %d domains, accounting for %d oddities, or %3.1f%% strangeness rate" % (len(weirdos), weirdo_total, 100.*weirdo_total/total_queries)
     print "\n".join("    %(domain)s (%(count)s hits, returned %(code)s)" % x for x in weirdos)
     print
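
(The effect of the new defaults on a concrete, hypothetical case: given
20100215.log, options.next defaults to 20100215.pickle and options.previous to
20100214.pickle, so consecutive daily runs chain their DNS MX caches. If the
log name does not start with YYYYMMDD, the bare except falls back to reusing
options.next.)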

@@ -193,7 +252,7 @@ def main(argv=None):
     printDetails(prevMisses[:10], total_queries, "+")
     print
 
-    writeNext(options.next, hits, misses)
+    writeNext(options.next, hits, misses, mxs)
     return 0
 
 if __name__ == "__main__":
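
A typical invocation after this change, for reference (filenames hypothetical;
-p and -n only need to be passed when the logs do not follow the YYYYMMDD
naming scheme):

    python quickparse.py 20100215.log
    python quickparse.py -p prev-cache.pickle -n new-cache.pickle access.log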