# telemetry-aggregator/dashboard.py

try:
    import simplejson as json
except ImportError:
    import json
import math
import sys

# Import histogram specs, generated by the makefile using specgen.py
import specs

# Remaining print budget for each log message, keyed on the *unformatted*
# message string (see log() below).
logMsgCount = {}

# Auxiliary method to write log messages
def log(msg, *args):
    """Write ``msg % args`` to stderr, at most 10 times per distinct ``msg``.

    The budget is tracked per format string, not per formatted result, so a
    message logged with many different arguments still shares one budget.

    msg  -- %-style format string
    args -- values interpolated into ``msg``
    """
    # We only print a log message the first 10 times we see it
    remaining = logMsgCount.get(msg, 10)
    if remaining <= 0:
        return
    logMsgCount[msg] = remaining - 1
    # Plain write() instead of the ``print >> stream`` chevron: the chevron
    # form is Python 2 only, whereas this emits the same line on 2 and 3.
    sys.stderr.write(msg % args + "\n")

# Helper that derives exponential bucket lower bounds from histogram
# parameters; adapted (with small changes) from histogram_tools.py.
def exponential_buckets(dmin, dmax, n_buckets):
    """Return a list of ``n_buckets`` exponentially spaced lower bounds.

    Slot 0 is always 0 and slot 1 is ``dmin``.  Every later slot is the
    rounded geometric step towards ``dmax``, bumped to "previous + 1" whenever
    rounding would not move past the previous bound.

    Raises IndexError when ``n_buckets`` is smaller than 2 (slot 1 does not
    exist) and whatever math.log raises for a non-positive ``dmax``.
    """
    log_max = math.log(dmax)
    edges = [0] * n_buckets
    lower = dmin
    edges[1] = lower
    for slot in range(2, n_buckets):
        slots_left = n_buckets - slot
        log_lower = math.log(lower)
        # Spread the remaining log-distance evenly over the remaining slots.
        log_target = log_lower + (log_max - log_lower) / slots_left
        candidate = int(math.floor(math.exp(log_target) + 0.5))
        # Bounds must keep growing even when rounding collapses a step.
        lower = candidate if candidate > lower else lower + 1
        edges[slot] = lower
    return edges

# Build the bucket -> index lookup from a sequence of bucket lower bounds;
# snippet adapted from specgen.py
def buckets2index_from_ranges(ranges):
    """Return a dict mapping ``str(lower_bound)`` to its position in ``ranges``.

    If two entries have the same string form, the later position wins.
    """
    # Deliberately does not use the builtin map(): this module defines its own
    # top-level map() job function, which shadows the builtin for every call
    # made after the module has finished loading.
    return dict((str(lower), index) for index, lower in enumerate(ranges))

# Bucket lower bounds shared by every simple measure: 50 exponential buckets
# with dmin=1 and dmax=30000.  Computed once and reused for both tuple
# members below instead of running the bucket computation twice.
_simple_measures_ranges = exponential_buckets(1, 30000, 50)

# Bucket offsets for simple measures, as a (bucket2index, buckets) tuple:
# index 0 maps str(lower bound) -> bucket position, index 1 is the list of
# lower bounds itself.
simple_measures_buckets = (
    buckets2index_from_ranges(_simple_measures_ranges),
    _simple_measures_ranges,
)

# Cache of all output values.  Nothing is written to the context until
# map_finished() gets called; aggregating in memory first is a hack that
# makes linear scans a lot faster :)
cache = {}

def map_finished(context):
    """Flush every cached (key, value) pair to ``context`` and reset the cache.

    context -- object exposing ``write(key, value)``
    """
    # The global declaration has to precede any use of the name in this
    # function; reading ``cache`` first is a SyntaxWarning on Python 2 and a
    # SyntaxError on Python 3.
    global cache
    log("map_finished outputing: %s keys", len(cache))
    for key in cache:
        context.write(key, cache[key])
    # Rebind rather than clear() so the flushed dict is simply dropped.
    cache = {}

# Auxiliary function for aggregating a result to the cache, we pass in context
# so we can skip caching here should we ever want to do this
def write_to_cache(key, value, context):
    """Merge ``value`` into the module-level ``cache`` entry for ``key``.

    key     -- cache key (hashable)
    value   -- dict mapping a filter path to a list of numbers; entries may be
               None, which is treated as 0 when two lists are summed
    context -- unused for now; kept so caching can be bypassed later

    The first value seen for a key (or for a filter path under that key) is
    stored as-is, without copying, so the cache takes ownership of it.  Later
    values are added element-wise, in place, into the stored lists.
    """
    cached = cache.get(key)
    if cached is None:
        cache[key] = value
        return
    for filterPath, hgramValues in value.items():
        existing = cached.get(filterPath)
        if existing is None:
            cached[filterPath] = hgramValues
            continue
        for y, incoming in enumerate(hgramValues):
            # Guard both operands: a None kept from the first insert would
            # otherwise make the in-place addition raise TypeError.
            existing[y] = (existing[y] or 0) + (incoming or 0)

# histogram incoming format:
#   [
#       bucket0, bucket1, ..., bucketN,
#       sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi
#   ]
# Aggregated histogram format:
#   [
#       bucket0, bucket1, ..., bucketN,
#       sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi, count
#   ]
# where count is the number of histograms aggregated in the histogram.

def map(key, dims, value, context):
    """Aggregate one telemetry submission into the module-level cache.

    key     -- record key; not used here
    dims    -- six-item sequence: reason, appName, channel, version, buildId,
               submissionDate (the last one is unpacked but not used)
    value   -- JSON text of the payload
    context -- passed through to write_to_cache()/map_simplemeasure()

    Nothing is written to ``context`` directly; results are merged into the
    cache under (channel, majorVersion, histogram name) keys.  Submissions
    whose 'info' section cannot be unpacked are logged and skipped.
    """
    # Unpack dimensions
    reason, appName, channel, version, buildId, submissionDate = dims

    # Get the major version
    majorVersion = version.split('.')[0]

    # Get the build date, ignore the rest of the buildId
    buildDate = buildId[:8]

    # Load JSON payload
    payload = json.loads(value)

    # Get OS, osVersion and architecture information.  TypeError covers a
    # payload or 'info' section that is not a JSON object at all.
    try:
        info = payload['info']
        OS = info['OS']
        osVersion = str(info['version'])
        arch = info['arch']
    except (KeyError, IndexError, TypeError, UnicodeEncodeError):
        log("error while unpacking the payload")
        return

    # todo combine OS + osVersion + sanitize on crazy platforms like linux to
    #      reduce pointless choices
    if OS == "Linux":
        osVersion = osVersion[:3]

    # Create filter path
    filterPath = (buildDate, reason, appName, OS, osVersion, arch)

    # For each histogram
    for hgramName, hgramValues in payload.get('histograms', {}).items():
        # Check that we have bucket information on this histogram
        bucket2index = specs.histograms.get(hgramName)
        if bucket2index is None:
            log("Missing bucket2index for %s", hgramName)
            continue
        # Skip histograms whose length doesn't match the spec: one slot per
        # bucket plus the five trailing statistics fields.
        if len(hgramValues) != len(bucket2index[0]) + 5:
            log("Bucket length mismatch for %s", hgramName)
            continue
        # Append the count field (1 histogram aggregated so far).
        write_to_cache((channel, majorVersion, hgramName),
                       {filterPath: hgramValues + [1]}, context)

    # Now read and output simple measures
    for name, measure in payload.get('simpleMeasurements', {}).items():
        # Handle cases where the value is a dictionary of simple measures
        if isinstance(measure, dict):
            for subName, subValue in measure.items():
                map_simplemeasure(channel, majorVersion, filterPath,
                                  name + "_" + str(subName), subValue, context)
        else:
            map_simplemeasure(channel, majorVersion, filterPath, str(name),
                              measure, context)

# Exact types accepted as simple measure values.  ``long`` only exists on
# Python 2; bool is intentionally not listed.
try:
    _SIMPLE_MEASURE_TYPES = (int, long, float)
except NameError:
    _SIMPLE_MEASURE_TYPES = (int, float)

# Map a simple measure
def map_simplemeasure(channel, majorVersion, filterPath, name, value, context):
    """Turn one simple measurement into a single-sample histogram and cache it.

    The output array has one slot per simple-measure bucket followed by
    sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi and count.
    It is merged into the cache under
    (channel, majorVersion, "SIMPLE_MEASURES_" + name.upper()).

    Values that are not int/long/float, or that are NaN/infinite, are logged
    and dropped.
    """
    # Sanity check value (exact type match, so booleans are rejected too)
    if type(value) not in _SIMPLE_MEASURE_TYPES:
        log("%s is not a value type for simpleMeasurements \"%s\"",
            type(value), name)
        return
    # NaN/Infinity would propagate into every aggregated sum for this key and
    # end up as non-standard NaN/Infinity tokens in the JSON output.
    if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
        log("non-finite value for simpleMeasurements \"%s\"", name)
        return

    bucket = simple_measures_buckets[1]
    outarray = [0] * (len(bucket) + 6)
    # Find the highest bucket whose lower bound does not exceed the value.
    # NOTE(review): a negative value matches no bucket (the lowest bound is
    # 0), yet it is still counted below -- confirm whether that is intended.
    for i in reversed(range(len(bucket))):
        if value >= bucket[i]:
            outarray[i] = 1
            break

    log_val = math.log(math.fabs(value) + 1)
    outarray[-6] = value                # sum
    outarray[-5] = log_val              # log_sum
    outarray[-4] = log_val * log_val    # log_sum_squares
    outarray[-3] = 0                    # sum_squares_lo
    outarray[-2] = 0                    # sum_squares_hi
    outarray[-1] = 1                    # count

    # Output result array
    write_to_cache((channel, majorVersion, "SIMPLE_MEASURES_" + name.upper()),
                   {filterPath: outarray}, context)

def commonCombine(values):
    """Sum histogram arrays per filter path across all dicts in ``values``.

    values -- iterable of dicts mapping a filter path to a list of numbers;
              entries may be None, which counts as 0 when lists are summed

    Returns a new dict mapping each filter path to the element-wise sum.  A
    filter path seen only once keeps its values unchanged (including None).
    The input lists are never modified.
    """
    output = {}
    for d in values:
        for filterPath, hgramValues in d.items():
            existing = output.get(filterPath)
            if existing is None:
                # Copy, so the in-place sums below cannot alter the caller's
                # list.
                output[filterPath] = list(hgramValues)
                continue
            for y, incoming in enumerate(hgramValues):
                # Guard both operands: the stored list may itself hold None.
                existing[y] = (existing[y] or 0) + (incoming or 0)
    return output

def reduce(key, values, context):
    """Combine all values for ``key`` and write one JSON document for it.

    key     -- sequence of strings whose third item is the histogram name
    values  -- iterable of dicts mapping a filter path to a value list
    context -- object exposing ``write(key, value)``

    Writes "/".join(key) together with a JSON object holding 'buckets' (the
    bucket lower bounds) and 'values' (combined arrays keyed by the
    "/"-joined filter path).  Keys naming a histogram with no bucket spec
    are logged and skipped.
    """
    # Resolve the bucket list first so an unknown histogram is reported
    # clearly instead of failing on a subscript of None.
    hgramName = key[2]
    if hgramName.startswith("SIMPLE_MEASURES_"):
        buckets = simple_measures_buckets[1]
    else:
        spec = specs.histograms.get(hgramName)
        if spec is None:
            log("reduce: missing bucket spec for %s", hgramName)
            return
        buckets = spec[1]

    # Produce output ready for json serialization
    output = {}
    for filterPath, hgramValues in commonCombine(values).items():
        output["/".join(filterPath)] = hgramValues

    # Write final output
    final_out = {
        'buckets':  buckets,
        'values':   output
    }
    context.write("/".join(key), json.dumps(final_out))