Renamed test and moved scripts to a zip archive for ease of deployment

This commit is contained in:
Jonas Finnemann Jensen 2013-09-25 19:07:11 +02:00
Родитель 336f57764f
Коммит d62a79e6d8
4 изменённых файлов: 126 добавлений и 428 удалений

2
.gitignore поставляемый
Просмотреть файл

@ -1,5 +1,5 @@
*.pyc
dashboard.zip
histogram_tools.py
histogram_specs.py
specs.py

Просмотреть файл

@ -1,5 +1,5 @@
FILES= histogram_tools.py Histograms.json specs.py
download: $(FILES)
FILES= histogram_tools.py Histograms.json specs.py dashboard.zip
all: $(FILES)
Histograms.json:
wget -c http://hg.mozilla.org/mozilla-central/raw-file/tip/toolkit/components/telemetry/Histograms.json -O $@
@ -10,5 +10,8 @@ histogram_tools.py:
specs.py: Histograms.json
python specgen.py $< > $@
dashboard.zip: specs.py dashboard.py
zip $@ $?
clean:
rm -f $(FILES)

Просмотреть файл

@ -1,13 +1,23 @@
try:
import simplejson as json
print "Using simplejson for faster json parsing"
except ImportError:
import json
import sys
import math
import sys
# Import histogram specs generated by the makefile using specgen.py
import specs
verbose = True
# Counts number of times we've printed a log message
logMsgCount = {}
# Auxiliary method to write log messages
def log(msg, *args):
# We only print a log message the first 10 times we see it
n = logMsgCount.get(msg, 10)
if n > 0:
logMsgCount[msg] = n - 1
print >> sys.stderr, msg % args
# Auxiliary method for computing bucket offsets from parameters, it is stolen
# from histogram_tools.py, though slightly modified...
@ -44,35 +54,67 @@ simple_measures_buckets = (
exponential_buckets(1, 30000, 50)
)
# Cache of all output values, we do a linear scan so we won't write any thing
# until map_finished() gets called... This is a hack that makes linear scans
# a lot faster :)
cache = {}
SPECS = "scripts/histogram_specs.json"
histogram_specs = json.loads(
jydoop.getResource(SPECS))
def map_finished(context):
log("map_finished outputing: %s keys", len(cache))
global cache
for key, value in cache.iteritems():
context.write(key, value)
cache = {}
def map(uid, line, context):
global histogram_specs
# Auxiliary function for aggregating a result to the cache, we pass in context
# so we can skip caching here should we ever want to do this
def write_to_cache(key, value, context):
global cache
cachedValue = cache.get(key, None)
if cachedValue is None:
cache[key] = value
else:
for filterPath, hgramValues in value.iteritems():
existing = cachedValue.get(filterPath, None)
if existing is None:
cachedValue[filterPath] = hgramValues
continue
for y in xrange(0, len(hgramValues)):
existing[y] += (hgramValues[y] or 0)
payload = json.loads(line)
# histogram incoming format:
# [
# bucket0, bucket1, ..., bucketN,
# sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi
# ]
# Aggregated histogram format:
# [
# bucket0, bucket1, ..., bucketN,
# sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi, count
# ]
# where count is the number of histograms aggregated in the histogram.
def map(key, dims, value, context):
# Unpack dimensions
reason, appName, channel, version, buildId, submissionDate = dims
# Get the major version
majorVersion = version.split('.')[0]
# Get the build date, ignore the rest of the buildId
buildDate = buildId[:8]
# Load JSON payload
payload = json.loads(value)
# Get OS, osVersion and architecture information
try:
i = payload['info']
channel = i.get('appUpdateChannel', "too_old")
OS = i['OS']
appName = i['appName']
reason = i['reason']
osVersion = str(i['version'])
#only care about major versions
appVersion = i['appVersion'].split('.')[0]
arch = i['arch']
buildDate = i['appBuildID'][:8]
info = payload['info']
OS = info['OS']
osVersion = str(info['version'])
arch = info['arch']
except (KeyError, IndexError, UnicodeEncodeError):
if verbose:
msg = "error while unpacking the payload"
print >> sys.stderr, msg
return
# TODO: histogram_specs should specify the list of versions/channels we
# care about
if not channel in ['release', 'aurora', 'nightly', 'beta', 'nightly-ux']:
log("error while unpacking the payload")
return
# todo combine OS + osVersion + santize on crazy platforms like linux to
@ -80,226 +122,87 @@ def map(uid, line, context):
if OS == "Linux":
osVersion = osVersion[:3]
path = (buildDate, reason, appName, OS, osVersion, arch)
# Sanitize path
for val in path:
if not isinstance(val, basestring) and type(val) in (int, float, long):
if verbose:
print >> sys.stderr, "Found type %s in path" % type(val)
return
# Create filter path
filterPath = (buildDate, reason, appName, OS, osVersion, arch)
# Sanitize channel and appVersion
for val in (channel, appVersion):
if not isinstance(val, basestring) and type(val) in (int, float, long):
if verbose:
print >> sys.stderr, ("Found type %s in channel or appVersion" %
type(val))
return
histograms = payload.get('histograms', None)
if histograms is None:
histograms = {}
if verbose:
msg = "histograms is None in map"
print >> sys.stderr, msg
for h_name, h_values in histograms.iteritems():
bucket2index = histogram_specs.get(h_name, None)
if bucket2index is None:
if verbose:
msg = "bucket2index is None in map"
print >> sys.stderr, msg
continue
else:
bucket2index = bucket2index[0]
# most buckets contain 0s, so preallocation is a significant win
outarray = [0] * (len(bucket2index) + 4)
index_error = False
type_error = False
if not isinstance(h_values, dict):
if verbose:
msg = "h_values is not a dictionary"
print >> sys.stderr, msg
continue
try:
values = h_values.get('values', None)
except AttributeError:
msg = "h_values was not a dict"
print >> sys.stderr, msg
return
if values is None:
continue
for bucket, value in values.iteritems():
index = bucket2index.get(bucket, None)
if index is None:
#print "%s's does not feature %s bucket in schema"
# % (h_name, bucket)
index_error = True
break
if type(value) not in (int, long, float):
type_error = True
if verbose:
print >> sys.stderr, "Bad value type: %s " % repr(value)
break
outarray[index] = value
if index_error:
if verbose:
msg = "index is None in map"
print >> sys.stderr, msg
continue
if type_error:
if verbose:
msg = "value is not int, long or float"
print >> sys.stderr, msg
continue
histogram_sum = h_values.get('sum', None)
if histogram_sum is None:
if verbose:
msg = "histogram_sum is None in map"
print >> sys.stderr, msg
continue
if type(histogram_sum) not in (int, long, float):
if verbose:
msg = ("histogram_sum is not int, long or float, but: %s" %
type(histogram_sum))
print >> sys.stderr, msg
continue
# if statistics isn't available we just leave the two slots as zeroes
if 'sum_squares_hi' in h_values and 'sum_squares_lo' in h_values:
outarray[-4] = h_values.get('sum_squares_hi', 0)
outarray[-3] = h_values.get('sum_squares_lo', 0)
elif 'log_sum' in h_values and 'log_sum_squares' in h_values:
outarray[-4] = h_values.get('log_sum', 0)
outarray[-3] = h_values.get('log_sum_squares', 0)
if type(outarray[-4]) not in (int, long, float):
if verbose:
print >> sys.stderr, ("sum_squares_hi or log_sum is type %s" %
type(outarray[-4]))
continue
if type(outarray[-3]) not in (int, long, float):
if verbose:
msg = ("sum_squares_lo or log_sum_squares is type %s" %
type(outarray[-3]))
print >> sys.stderr, msg
continue
outarray[-2] = histogram_sum
outarray[-1] = 1 # count
try:
context.write((channel, appVersion, h_name), {path: outarray})
except TypeError:
dict_locations = [p for p, t in enumerate(path) if type(t) is dict]
if dict_locations:
field_names = ["buildDate", "reason", "appName", "OS",
"osVersion", "arch"]
dict_field_names = [field_names[i] for i in dict_locations]
msg = ("unable to hash the following `path` fields: %s" %
(' '.join(dict_field_names)))
else:
msg = "TypeError when writing map output."
if verbose:
print >> sys.stderr, msg
# For each histogram
for hgramName, hgramValues in payload.get('histograms', {}).iteritems():
# Check that we have bucket information on this histogram
bucket2index = specs.histograms.get(hgramName, None)
if bucket2index == None:
log("Missing bucket2index for %s", hgramName)
continue
# Abort if bucket length doesn't match
if len(hgramValues) == len(bucket2index[0]) + 5:
write_to_cache((channel, majorVersion, hgramName),
{filterPath: hgramValues + [1]}, context)
# Now read and output simple measures
simple_measures = payload.get('simpleMeasurements', None)
if simple_measures is None:
if verbose:
msg = "SimpleMeasures are missing..."
print >> sys.stderr, msg
return
for sm_name, sm_value in simple_measures.iteritems():
for name, value in payload.get('simpleMeasurements', {}).iteritems():
# Handle cases where the value is a dictionary of simple measures
if type(sm_value) == dict:
for sub_name, sub_value in sm_value.iteritems():
map_simplemeasure(channel, appVersion, path,
sm_name + "_" + sub_name, sub_value, context)
if type(value) == dict:
for subName, subValue in value.iteritems():
map_simplemeasure(channel, majorVersion, filterPath,
name + "_" + str(subName), subValue, context)
else:
map_simplemeasure(channel, appVersion, path, sm_name, sm_value,
context)
map_simplemeasure(channel, majorVersion, filterPath, str(name),
value, context)
# Map a simple measure
def map_simplemeasure(channel, appVersion, path, name, value, context):
def map_simplemeasure(channel, majorVersion, filterPath, name, value, context):
# Sanity check value
if type(value) not in (int, long):
if verbose:
msg = ("%s is not a value type for simpleMeasurements \"%s\"" %
(type(value), name))
print >> sys.stderr, msg
if type(value) not in (int, long, float):
log("%s is not a value type for simpleMeasurements \"%s\"",
type(value), name)
return
bucket = simple_measures_buckets[1]
outarray = [0] * (len(bucket) + 5)
outarray = [0] * (len(bucket) + 6)
for i in reversed(range(0, len(bucket))):
if value >= bucket[i]:
outarray[i] = 1
break
log_val = math.log(math.fabs(value) + 1)
outarray[-4] = log_val # log_sum
outarray[-3] = log_val * log_val # log_sum_squares
outarray[-2] = value # sum
outarray[-6] = value # sum
outarray[-5] = log_val # log_sum
outarray[-4] = log_val * log_val # log_sum_squares
outarray[-3] = 0 # sum_squares_lo
outarray[-2] = 0 # sum_squares_hi
outarray[-1] = 1 # count
# Output result array
context.write((channel, appVersion, "SIMPLE_MEASURES_" + name.upper()),
{path: outarray})
write_to_cache((channel, majorVersion, "SIMPLE_MEASURES_" + name.upper()),
{filterPath: outarray}, context)
def commonCombine(values):
out = {}
output = {}
for d in values:
for filter_path, histogram in d.iteritems():
existing = out.get(filter_path, None)
for filterPath, hgramValues in d.iteritems():
existing = output.get(filterPath, None)
if existing is None:
out[filter_path] = histogram
output[filterPath] = hgramValues
continue
for y in range(0, len(histogram)):
existing[y] += (histogram[y] or 0)
return out
def combine(key, values, context):
out = commonCombine(values)
context.write(key, out)
for y in xrange(0, len(hgramValues)):
existing[y] += (hgramValues[y] or 0)
return output
def reduce(key, values, context):
out = commonCombine(values)
out_values = {}
h_name = key[2]
for (filter_path, histogram) in out.iteritems():
# first, discard any malformed (non int) entries, while allowing floats
# in the statistics
for i, val in enumerate(histogram):
T = type(val)
if T is not int:
if T is float:
if i is len(histogram) - 3 or i is len(histogram) - 4:
continue # allow elements of stats to be floats
msg = ("discarding %s - %s malformed type: %s on index %i" %
('/'.join(filter_path), h_name, T, i))
if verbose:
print >> sys.stderr, msg
return
out_values["/".join(filter_path)] = histogram
# Produce output ready for json serialization
output = {}
for filterPath, hgramValues in commonCombine(values).iteritems():
output["/".join(filterPath)] = hgramValues
if h_name.startswith("SIMPLE_MEASURES_"):
# Get histogram name
hgramName = key[2]
if hgramName.startswith("SIMPLE_MEASURES_"):
buckets = simple_measures_buckets[1];
else:
# histogram_specs lookup below is guaranteed to succeed, because of mapper
buckets = histogram_specs.get(h_name)[1]
buckets = specs.histograms.get(hgramName)[1]
# Write final output
final_out = {
'buckets': buckets,
'values': out_values
'buckets': buckets,
'values': output
}
context.write("/".join(key), json.dumps(final_out))
def output(path, results):
f = open(path, 'w')
for k, v in results:
f.write(k + "\t" + v + "\n")

208
test.py
Просмотреть файл

@ -1,208 +0,0 @@
try:
import simplejson as json
except ImportError:
import json
import math
import sys
# Import histogram specs generated by the makefile using specgen.py
import specs
# Remaining number of times each distinct log message may still be printed
logMsgCount = {}

# Auxiliary method to write log messages
def log(msg, *args):
    """Print ``msg % args`` to stderr, at most 10 times per distinct msg."""
    remaining = logMsgCount.get(msg, 10)
    if remaining <= 0:
        return
    logMsgCount[msg] = remaining - 1
    sys.stderr.write((msg % args) + "\n")
# Auxiliary method for computing bucket offsets from parameters, it is stolen
# from histogram_tools.py, though slightly modified...
def exponential_buckets(dmin, dmax, n_buckets):
    """Return the n_buckets lower bucket edges of an exponential histogram.

    Index 0 stays 0 and index 1 is dmin; later edges grow roughly
    exponentially towards dmax, advancing by at least 1 per bucket so
    the edges remain strictly increasing.
    """
    log_max = math.log(dmax)
    edges = [0] * n_buckets
    edge = dmin
    edges[1] = edge
    for i in range(2, n_buckets):
        log_edge = math.log(edge)
        # Spread the remaining log-space distance over the remaining buckets
        log_step = (log_max - log_edge) / (n_buckets - i)
        candidate = int(math.floor(math.exp(log_edge + log_step) + 0.5))
        # Advance by at least one (equivalent to the original if/else)
        edge = max(candidate, edge + 1)
        edges[i] = edge
    return edges
# Create bucket2index from ranges... snippet pretty much stolen
# from specgen.py
def buckets2index_from_ranges(ranges):
    """Map each bucket lower bound (as a string) to its index in `ranges`.

    Keys are stringified because histogram JSON payloads label buckets
    with strings.
    """
    # enumerate() replaces the original map()+index-loop; besides being
    # idiomatic, it also keeps working under Python 3, where map() returns
    # an iterator that cannot be len()'d or indexed.
    return dict((str(r), i) for i, r in enumerate(ranges))
# Bucket offsets for simple measures:
#   [0]: dict mapping str(lower_bound) -> bucket index
#   [1]: list of the 50 exponential bucket lower bounds spanning 1..30000
simple_measures_buckets = (
    buckets2index_from_ranges(
        exponential_buckets(1, 30000, 50)),
    exponential_buckets(1, 30000, 50)
)
# Cache of all output values, we do a linear scan so we won't write any thing
# until map_finished() gets called... This is a hack that makes linear scans
# a lot faster :)
cache = {}

def map_finished(context):
    """Flush every cached (key, value) pair to `context` and reset the cache."""
    # Declare the global before first use: the original read `cache` in the
    # log() call before the `global` statement, which is a SyntaxWarning on
    # Python 2 and a SyntaxError on Python 3.
    global cache
    log("map_finished outputing: %s keys", len(cache))
    for key, value in cache.iteritems():
        context.write(key, value)
    cache = {}
# Auxiliary function for aggregating a result to the cache, we pass in context
# so we can skip caching here should we ever want to do this
def write_to_cache(key, value, context):
    """Merge `value` ({filterPath: histogramArray}) under `key` into the
    global cache, summing arrays element-wise (None entries count as 0)."""
    global cache
    cached = cache.get(key, None)
    if cached is None:
        # First sighting of this key: store the whole value as-is
        cache[key] = value
        return
    for fpath, hvals in value.iteritems():
        acc = cached.get(fpath, None)
        if acc is None:
            cached[fpath] = hvals
        else:
            for pos in xrange(len(hvals)):
                acc[pos] += (hvals[pos] or 0)
# histogram incoming format:
# [
#   bucket0, bucket1, ..., bucketN,
#   sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi
# ]
# Aggregated histogram format:
# [
#   bucket0, bucket1, ..., bucketN,
#   sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi, count
# ]
# where count is the number of histograms aggregated in the histogram.
def map(key, dims, value, context):
    """Aggregate one telemetry submission into the global cache.

    `dims` supplies the filter dimensions, `value` is the raw JSON payload
    string. Malformed payloads are logged (via log()) and dropped.
    """
    # Unpack dimensions
    reason, appName, channel, version, buildId, submissionDate = dims
    # Get the major version
    majorVersion = version.split('.')[0]
    # Get the build date, ignore the rest of the buildId
    buildDate = buildId[:8]
    # Load JSON payload
    payload = json.loads(value)
    # Get OS, osVersion and architecture information
    try:
        info = payload['info']
        OS = info['OS']
        # 'version' may arrive as a number; normalize to a string
        osVersion = str(info['version'])
        arch = info['arch']
    except (KeyError, IndexError, UnicodeEncodeError):
        log("error while unpacking the payload")
        return
    # todo combine OS + osVersion + sanitize on crazy platforms like linux to
    # reduce pointless choices
    if OS == "Linux":
        # Keep only the first 3 chars of the kernel version to limit cardinality
        osVersion = osVersion[:3]
    # Create filter path
    filterPath = (buildDate, reason, appName, OS, osVersion, arch)
    # For each histogram
    for hgramName, hgramValues in payload.get('histograms', {}).iteritems():
        # Check that we have bucket information on this histogram
        bucket2index = specs.histograms.get(hgramName, None)
        if bucket2index == None:
            log("Missing bucket2index for %s", hgramName)
            continue
        # Only aggregate when the array length matches the spec: bucket count
        # plus the five statistics fields; mismatched arrays are skipped
        if len(hgramValues) == len(bucket2index[0]) + 5:
            # Append count=1 so combiners/reducers can sum submission counts
            write_to_cache((channel, majorVersion, hgramName),
                           {filterPath: hgramValues + [1]}, context)
    # Now read and output simple measures
    for name, value in payload.get('simpleMeasurements', {}).iteritems():
        # Handle cases where the value is a dictionary of simple measures
        if type(value) == dict:
            for subName, subValue in value.iteritems():
                map_simplemeasure(channel, majorVersion, filterPath,
                                  name + "_" + str(subName), subValue, context)
        else:
            map_simplemeasure(channel, majorVersion, filterPath, str(name),
                              value, context)
# Map a simple measure
def map_simplemeasure(channel, majorVersion, filterPath, name, value, context):
    """Bucket one simple measure and aggregate it into the cache under
    the key (channel, majorVersion, "SIMPLE_MEASURES_<NAME>")."""
    # Only numeric values can be bucketed
    if type(value) not in (int, long, float):
        log("%s is not a value type for simpleMeasurements \"%s\"",
            type(value), name)
        return
    ranges = simple_measures_buckets[1]
    # Bucket counts followed by the six statistics fields
    result = [0] * (len(ranges) + 6)
    # Mark the highest bucket whose lower bound the value reaches
    idx = len(ranges) - 1
    while idx >= 0:
        if value >= ranges[idx]:
            result[idx] = 1
            break
        idx -= 1
    log_val = math.log(math.fabs(value) + 1)
    result[-6] = value              # sum
    result[-5] = log_val            # log_sum
    result[-4] = log_val * log_val  # log_sum_squares
    result[-3] = 0                  # sum_squares_lo
    result[-2] = 0                  # sum_squares_hi
    result[-1] = 1                  # count
    # Aggregate the result array into the cache
    write_to_cache((channel, majorVersion, "SIMPLE_MEASURES_" + name.upper()),
                   {filterPath: result}, context)
def commonCombine(values):
output = {}
for d in values:
for filterPath, hgramValues in d.iteritems():
existing = output.get(filterPath, None)
if existing is None:
output[filterPath] = hgramValues
continue
for y in xrange(0, len(hgramValues)):
existing[y] += (hgramValues[y] or 0)
return output
def reduce(key, values, context):
    """Merge all partial aggregates for `key` and emit one JSON record."""
    combined = commonCombine(values)
    # JSON cannot serialize tuple keys, so join each filter path with '/'
    serializable = {}
    for fpath, hvals in combined.iteritems():
        serializable["/".join(fpath)] = hvals
    # key = (channel, majorVersion, histogramName)
    measure = key[2]
    if measure.startswith("SIMPLE_MEASURES_"):
        # Simple measures all share one fixed bucket layout
        bucketRanges = simple_measures_buckets[1]
    else:
        # Spec lookup is guaranteed to succeed: the mapper only emits
        # histograms that appear in specs.histograms
        bucketRanges = specs.histograms.get(measure)[1]
    # Write final output
    context.write("/".join(key),
                  json.dumps({'buckets': bucketRanges,
                              'values': serializable}))