209 строки
7.1 KiB
Python
209 строки
7.1 KiB
Python
try:
|
|
import simplejson as json
|
|
except ImportError:
|
|
import json
|
|
import math
|
|
import sys
|
|
|
|
# Import histogram specs and generated by makefile using specgen.py
|
|
import specs
|
|
|
|
# Counts number of times we've printed a log message
|
|
logMsgCount = {}
|
|
|
|
# Auxiliary method to write log messages
|
|
def log(msg, *args):
|
|
# We only print a log message the first 10 times we see it
|
|
n = logMsgCount.get(msg, 10)
|
|
if n > 0:
|
|
logMsgCount[msg] = n - 1
|
|
print >> sys.stderr, msg % args
|
|
|
|
# Auxiliary method for computing bucket offsets from parameters, it is stolen
|
|
# from histogram_tools.py, though slightly modified...
|
|
def exponential_buckets(dmin, dmax, n_buckets):
|
|
log_max = math.log(dmax);
|
|
ret_array = [0] * n_buckets
|
|
current = dmin
|
|
ret_array[1] = current
|
|
for bucket_index in range(2, n_buckets):
|
|
log_current = math.log(current)
|
|
log_ratio = (log_max - log_current) / (n_buckets - bucket_index)
|
|
log_next = log_current + log_ratio
|
|
next_value = int(math.floor(math.exp(log_next) + 0.5))
|
|
if next_value > current:
|
|
current = next_value
|
|
else:
|
|
current = current + 1
|
|
ret_array[bucket_index] = current
|
|
return ret_array
|
|
|
|
# Create buckets from buckets2index from ranges... snippet pretty much stolen
|
|
# from specgen.py
|
|
def buckets2index_from_ranges(ranges):
|
|
buckets = map(str, ranges)
|
|
bucket2index = {}
|
|
for i in range(0, len(buckets)):
|
|
bucket2index[buckets[i]] = i
|
|
return bucket2index
|
|
|
|
# Bucket offsets for simple measures
|
|
simple_measures_buckets = (
|
|
buckets2index_from_ranges(
|
|
exponential_buckets(1, 30000, 50)),
|
|
exponential_buckets(1, 30000, 50)
|
|
)
|
|
|
|
# Cache of all output values, we do a linear scan so we won't write any thing
|
|
# until map_finished() gets called... This is hack that makes linear scans
|
|
# a lot faster :)
|
|
cache = {}
|
|
|
|
def map_finished(context):
|
|
log("map_finished outputing: %s keys", len(cache))
|
|
global cache
|
|
for key, value in cache.iteritems():
|
|
context.write(key, value)
|
|
cache = {}
|
|
|
|
# Auxiliary function for aggregating a result to the cache, we pass in context
|
|
# so we can skip caching here should we ever want to do this
|
|
def write_to_cache(key, value, context):
|
|
global cache
|
|
cachedValue = cache.get(key, None)
|
|
if cachedValue is None:
|
|
cache[key] = value
|
|
else:
|
|
for filterPath, hgramValues in value.iteritems():
|
|
existing = cachedValue.get(filterPath, None)
|
|
if existing is None:
|
|
cachedValue[filterPath] = hgramValues
|
|
continue
|
|
for y in xrange(0, len(hgramValues)):
|
|
existing[y] += (hgramValues[y] or 0)
|
|
|
|
# histogram incoming format:
|
|
# [
|
|
# bucket0, bucket1, ..., bucketN,
|
|
# sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi
|
|
# ]
|
|
# Aggregated histogram format:
|
|
# [
|
|
# bucket0, bucket1, ..., bucketN,
|
|
# sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi, count
|
|
# ]
|
|
# where count is the number of histograms aggregated in the histogram.
|
|
|
|
def map(key, dims, value, context):
|
|
# Unpack dimensions
|
|
reason, appName, channel, version, buildId, submissionDate = dims
|
|
|
|
# Get the major version
|
|
majorVersion = version.split('.')[0]
|
|
|
|
# Get the build date, ignore the rest of the buildId
|
|
buildDate = buildId[:8]
|
|
|
|
# Load JSON payload
|
|
payload = json.loads(value)
|
|
|
|
# Get OS, osVersion and architecture information
|
|
try:
|
|
info = payload['info']
|
|
OS = info['OS']
|
|
osVersion = str(info['version'])
|
|
arch = info['arch']
|
|
except (KeyError, IndexError, UnicodeEncodeError):
|
|
log("error while unpacking the payload")
|
|
return
|
|
|
|
# todo combine OS + osVersion + santize on crazy platforms like linux to
|
|
# reduce pointless choices
|
|
if OS == "Linux":
|
|
osVersion = osVersion[:3]
|
|
|
|
# Create filter path
|
|
filterPath = (buildDate, reason, appName, OS, osVersion, arch)
|
|
|
|
# For each histogram
|
|
for hgramName, hgramValues in payload.get('histograms', {}).iteritems():
|
|
# Check that we have bucket information on this histogram
|
|
bucket2index = specs.histograms.get(hgramName, None)
|
|
if bucket2index == None:
|
|
log("Missing bucket2index for %s", hgramName)
|
|
continue
|
|
# Abort if bucket length doesn't match
|
|
if len(hgramValues) == len(bucket2index[0]) + 5:
|
|
write_to_cache((channel, majorVersion, hgramName),
|
|
{filterPath: hgramValues + [1]}, context)
|
|
|
|
# Now read and output simple measures
|
|
for name, value in payload.get('simpleMeasurements', {}).iteritems():
|
|
# Handle cases where the value is a dictionary of simple measures
|
|
if type(value) == dict:
|
|
for subName, subValue in value.iteritems():
|
|
map_simplemeasure(channel, majorVersion, filterPath,
|
|
name + "_" + str(subName), subValue, context)
|
|
else:
|
|
map_simplemeasure(channel, majorVersion, filterPath, str(name),
|
|
value, context)
|
|
|
|
# Map a simple measure
|
|
def map_simplemeasure(channel, majorVersion, filterPath, name, value, context):
|
|
# Sanity check value
|
|
if type(value) not in (int, long, float):
|
|
log("%s is not a value type for simpleMeasurements \"%s\"",
|
|
type(value), name)
|
|
return
|
|
|
|
bucket = simple_measures_buckets[1]
|
|
outarray = [0] * (len(bucket) + 6)
|
|
for i in reversed(range(0, len(bucket))):
|
|
if value >= bucket[i]:
|
|
outarray[i] = 1
|
|
break
|
|
|
|
log_val = math.log(math.fabs(value) + 1)
|
|
outarray[-6] = value # sum
|
|
outarray[-5] = log_val # log_sum
|
|
outarray[-4] = log_val * log_val # log_sum_squares
|
|
outarray[-3] = 0 # sum_squares_lo
|
|
outarray[-2] = 0 # sum_squares_hi
|
|
outarray[-1] = 1 # count
|
|
|
|
# Output result array
|
|
write_to_cache((channel, majorVersion, "SIMPLE_MEASURES_" + name.upper()),
|
|
{filterPath: outarray}, context)
|
|
|
|
def commonCombine(values):
|
|
output = {}
|
|
for d in values:
|
|
for filterPath, hgramValues in d.iteritems():
|
|
existing = output.get(filterPath, None)
|
|
if existing is None:
|
|
output[filterPath] = hgramValues
|
|
continue
|
|
for y in xrange(0, len(hgramValues)):
|
|
existing[y] += (hgramValues[y] or 0)
|
|
return output
|
|
|
|
def reduce(key, values, context):
|
|
# Produce output ready for json serialization
|
|
output = {}
|
|
for filterPath, hgramValues in commonCombine(values).iteritems():
|
|
output["/".join(filterPath)] = hgramValues
|
|
|
|
# Get histogram name
|
|
hgramName = key[2]
|
|
if hgramName.startswith("SIMPLE_MEASURES_"):
|
|
buckets = simple_measures_buckets[1];
|
|
else:
|
|
buckets = specs.histograms.get(hgramName)[1]
|
|
|
|
# Write final output
|
|
final_out = {
|
|
'buckets': buckets,
|
|
'values': output
|
|
}
|
|
context.write("/".join(key), json.dumps(final_out))
|