Renamed test and moved scripts to zip archive for ease of deployment

2013-09-25 19:07:11 +02:00 · 2013-09-25 19:07:11 +02:00 · d62a79e6d8
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,5 @@
 *.pyc
-
+dashboard.zip
 histogram_tools.py
 histogram_specs.py
 specs.py
--- a/7
+++ b/7
@ -1,5 +1,5 @@
-FILES= histogram_tools.py Histograms.json specs.py
-download: $(FILES)
+FILES= histogram_tools.py Histograms.json specs.py dashboard.zip
+all: $(FILES)

 Histograms.json:
 	wget -c http://hg.mozilla.org/mozilla-central/raw-file/tip/toolkit/components/telemetry/Histograms.json -O $@
@ -10,5 +10,8 @@ histogram_tools.py:
 specs.py: Histograms.json
 	python specgen.py $< > $@

+dashboard.zip: specs.py dashboard.py
+	zip $@ $?
+
 clean:
 	rm -f $(FILES)
--- a/dashboard.py
+++ b/dashboard.py
@ -1,13 +1,23 @@
 try:
    import simplejson as json
-    print "Using simplejson for faster json parsing"
 except ImportError:
    import json
-import sys
 import math
+import sys

+# Import histogram specs generated by the makefile using specgen.py
+import specs

-verbose = True
+# Counts number of times we've printed a log message
+logMsgCount = {}
+
+# Auxiliary method to write log messages
+def log(msg, *args):
+    # We only print a log message the first 10 times we see it
+    n = logMsgCount.get(msg, 10)
+    if n > 0:
+        logMsgCount[msg] = n - 1
+        print >> sys.stderr, msg % args

 # Auxiliary method for computing bucket offsets from parameters, it is stolen
 # from histogram_tools.py, though slightly modified...
@ -44,35 +54,67 @@ simple_measures_buckets = (
                           exponential_buckets(1, 30000, 50)
                           )

+# Cache of all output values; we do a linear scan, so we won't write anything
+# until map_finished() gets called... This is a hack that makes linear scans
+# a lot faster :)
+cache = {}

-SPECS = "scripts/histogram_specs.json"
-histogram_specs = json.loads(
-    jydoop.getResource(SPECS))
+def map_finished(context):
+    log("map_finished outputing: %s keys", len(cache))
+    global cache
+    for key, value in cache.iteritems():
+        context.write(key, value)
+    cache = {}

-def map(uid, line, context):
-    global histogram_specs
+# Auxiliary function for aggregating a result to the cache, we pass in context
+# so we can skip caching here should we ever want to do this
+def write_to_cache(key, value, context):
+    global cache
+    cachedValue = cache.get(key, None)
+    if cachedValue is None:
+        cache[key] = value
+    else:
+        for filterPath, hgramValues in value.iteritems():
+            existing = cachedValue.get(filterPath, None)
+            if existing is None:
+                cachedValue[filterPath] = hgramValues
+                continue
+            for y in xrange(0, len(hgramValues)):
+                existing[y] += (hgramValues[y] or 0)

-    payload = json.loads(line)
+# histogram incoming format:
+#   [
+#       bucket0, bucket1, ..., bucketN,
+#       sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi
+#   ]
+# Aggregated histogram format:
+#   [
+#       bucket0, bucket1, ..., bucketN,
+#       sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi, count
+#   ]
+# where count is the number of histograms aggregated in the histogram.
+
+def map(key, dims, value, context):
+    # Unpack dimensions
+    reason, appName, channel, version, buildId, submissionDate = dims
+
+    # Get the major version
+    majorVersion = version.split('.')[0]
+
+    # Get the build date, ignore the rest of the buildId
+    buildDate = buildId[:8]
+
+    # Load JSON payload
+    payload = json.loads(value)
+
+    # Get OS, osVersion and architecture information
    try:
-        i = payload['info']
-        channel = i.get('appUpdateChannel', "too_old")
-        OS = i['OS']
-        appName = i['appName']
-        reason = i['reason']
-        osVersion = str(i['version'])
-        #only care about major versions
-        appVersion = i['appVersion'].split('.')[0]
-        arch = i['arch']
-        buildDate = i['appBuildID'][:8]
+        info = payload['info']
+        OS = info['OS']
+        osVersion = str(info['version'])
+        arch = info['arch']
    except (KeyError, IndexError, UnicodeEncodeError):
-        if verbose:
-            msg = "error while unpacking the payload"
-            print >> sys.stderr, msg
-        return
-
-    # TODO: histogram_specs should specify the list of versions/channels we
-    #       care about
-    if not channel in ['release', 'aurora', 'nightly', 'beta', 'nightly-ux']:
+        log("error while unpacking the payload")
        return

    # todo combine OS + osVersion + santize on crazy platforms like linux to
@ -80,226 +122,87 @@ def map(uid, line, context):
    if OS == "Linux":
        osVersion = osVersion[:3]

-    path = (buildDate, reason, appName, OS, osVersion, arch)
-    # Sanitize path
-    for val in path:
-        if not isinstance(val, basestring) and type(val) in (int, float, long):
-            if verbose:
-                print >> sys.stderr, "Found type %s in path" % type(val)
-            return
+    # Create filter path
+    filterPath = (buildDate, reason, appName, OS, osVersion, arch)

-    # Sanitize channel and appVersion
-    for val in (channel, appVersion):
-        if not isinstance(val, basestring) and type(val) in (int, float, long):
-            if verbose:
-                print >> sys.stderr, ("Found type %s in channel or appVersion" %
-                                      type(val))
-            return
-
-    histograms = payload.get('histograms', None)
-    if histograms is None:
-        histograms = {}
-        if verbose:
-            msg = "histograms is None in map"
-            print >> sys.stderr, msg
-    for h_name, h_values in histograms.iteritems():
-        bucket2index = histogram_specs.get(h_name, None)
-        if bucket2index is None:
-            if verbose:
-                msg = "bucket2index is None in map"
-                print >> sys.stderr, msg
+    # For each histogram
+    for hgramName, hgramValues in payload.get('histograms', {}).iteritems():
+        # Check that we have bucket information on this histogram
+        bucket2index = specs.histograms.get(hgramName, None)
+        if bucket2index == None:
+            log("Missing bucket2index for %s", hgramName)
            continue
-        else:
-            bucket2index = bucket2index[0]
-
-        # most buckets contain 0s, so preallocation is a significant win
-        outarray = [0] * (len(bucket2index) + 4)
-
-        index_error = False
-        type_error = False
-        if not isinstance(h_values, dict):
-            if verbose:
-                msg = "h_values is not a dictionary"
-                print >> sys.stderr, msg
-            continue
-
-        try:
-            values = h_values.get('values', None)
-        except AttributeError:
-            msg = "h_values was not a dict"
-            print >> sys.stderr, msg
-            return
-        if values is None:
-            continue
-        for bucket, value in values.iteritems():
-            index = bucket2index.get(bucket, None)
-            if index is None:
-                #print "%s's does not feature %s bucket in schema"
-                #    % (h_name, bucket)
-                index_error = True
-                break
-            if type(value) not in (int, long, float):
-                type_error = True
-                if verbose:
-                    print >> sys.stderr, "Bad value type: %s " % repr(value)
-                break
-            outarray[index] = value
-        if index_error:
-            if verbose:
-                msg = "index is None in map"
-                print >> sys.stderr, msg
-            continue
-        if type_error:
-            if verbose:
-                msg = "value is not int, long or float"
-                print >> sys.stderr, msg
-            continue
-
-        histogram_sum = h_values.get('sum', None)
-        if histogram_sum is None:
-            if verbose:
-                msg = "histogram_sum is None in map"
-                print >> sys.stderr, msg
-            continue
-        if type(histogram_sum) not in (int, long, float):
-            if verbose:
-                msg = ("histogram_sum is not int, long or float, but: %s" %
-                       type(histogram_sum))
-                print >> sys.stderr, msg
-            continue
-        # if statistics isn't available we just leave the two slots as zeroes
-        if 'sum_squares_hi' in h_values and 'sum_squares_lo' in h_values:
-            outarray[-4] = h_values.get('sum_squares_hi', 0)
-            outarray[-3] = h_values.get('sum_squares_lo', 0)
-        elif 'log_sum' in h_values and 'log_sum_squares' in h_values:
-            outarray[-4] = h_values.get('log_sum', 0)
-            outarray[-3] = h_values.get('log_sum_squares', 0)
-        if type(outarray[-4]) not in (int, long, float):
-            if verbose:
-                print >> sys.stderr, ("sum_squares_hi or log_sum is type %s" %
-                                      type(outarray[-4]))
-            continue
-        if type(outarray[-3]) not in (int, long, float):
-            if verbose:
-                msg = ("sum_squares_lo or log_sum_squares is type %s" %
-                       type(outarray[-3]))
-                print >> sys.stderr, msg
-            continue
-        outarray[-2] = histogram_sum
-        outarray[-1] = 1        # count
-        try:
-            context.write((channel, appVersion, h_name), {path: outarray})
-        except TypeError:
-            dict_locations = [p for p, t in enumerate(path) if type(t) is dict]
-            if dict_locations:
-                field_names = ["buildDate", "reason", "appName", "OS",
-                               "osVersion", "arch"]
-                dict_field_names = [field_names[i] for i in dict_locations]
-                msg = ("unable to hash the following `path` fields: %s" %
-                       (' '.join(dict_field_names)))
-            else:
-                msg = "TypeError when writing map output."
-            if verbose:
-                print >> sys.stderr, msg
-            continue
-
+        # Skip this histogram if bucket length doesn't match
+        if len(hgramValues) == len(bucket2index[0]) + 5:
+            write_to_cache((channel, majorVersion, hgramName),
+                          {filterPath: hgramValues + [1]}, context)
+    
    # Now read and output simple measures
-    simple_measures = payload.get('simpleMeasurements', None)
-    if simple_measures is None:
-        if verbose:
-            msg = "SimpleMeasures are missing..."
-            print >> sys.stderr, msg
-        return
-    for sm_name, sm_value in simple_measures.iteritems():
+    for name, value in payload.get('simpleMeasurements', {}).iteritems():
        # Handle cases where the value is a dictionary of simple measures
-        if type(sm_value) == dict:
-            for sub_name, sub_value in sm_value.iteritems():
-                map_simplemeasure(channel, appVersion, path,
-                                  sm_name + "_" + sub_name, sub_value, context)
+        if type(value) == dict:
+            for subName, subValue in value.iteritems():
+                map_simplemeasure(channel, majorVersion, filterPath,
+                                  name + "_" + str(subName), subValue, context)
        else:
-            map_simplemeasure(channel, appVersion, path, sm_name, sm_value,
-                              context)
-
+            map_simplemeasure(channel, majorVersion, filterPath, str(name),
+                              value, context)

 # Map a simple measure
-def map_simplemeasure(channel, appVersion, path, name, value, context):
+def map_simplemeasure(channel, majorVersion, filterPath, name, value, context):
    # Sanity check value
-    if type(value) not in (int, long):
-        if verbose:
-            msg = ("%s is not a value type for simpleMeasurements \"%s\"" %
-                   (type(value), name))
-            print >> sys.stderr, msg
+    if type(value) not in (int, long, float):
+        log("%s is not a value type for simpleMeasurements \"%s\"",
+            type(value), name)
        return

    bucket = simple_measures_buckets[1]
-    outarray = [0] * (len(bucket) + 5)
+    outarray = [0] * (len(bucket) + 6)
    for i in reversed(range(0, len(bucket))):
        if value >= bucket[i]:
            outarray[i] = 1
            break

    log_val = math.log(math.fabs(value) + 1)
-    outarray[-4] = log_val              # log_sum
-    outarray[-3] = log_val * log_val    # log_sum_squares
-    outarray[-2] = value                # sum
+    outarray[-6] = value                # sum
+    outarray[-5] = log_val              # log_sum
+    outarray[-4] = log_val * log_val    # log_sum_squares
+    outarray[-3] = 0                    # sum_squares_lo
+    outarray[-2] = 0                    # sum_squares_hi
    outarray[-1] = 1                    # count

    # Output result array
-    context.write((channel, appVersion, "SIMPLE_MEASURES_" + name.upper()), 
-                  {path: outarray})
-
+    write_to_cache((channel, majorVersion, "SIMPLE_MEASURES_" + name.upper()), 
+                   {filterPath: outarray}, context)

 def commonCombine(values):
-    out = {}
+    output = {}
    for d in values:
-        for filter_path, histogram in d.iteritems():
-            existing = out.get(filter_path, None)
+        for filterPath, hgramValues in d.iteritems():
+            existing = output.get(filterPath, None)
            if existing is None:
-                out[filter_path] = histogram
+                output[filterPath] = hgramValues
                continue
-            for y in range(0, len(histogram)):
-                existing[y] += (histogram[y] or 0)
-    return out
-
-
-def combine(key, values, context):
-    out = commonCombine(values)
-    context.write(key, out)
-
+            for y in xrange(0, len(hgramValues)):
+                existing[y] += (hgramValues[y] or 0)
+    return output

 def reduce(key, values, context):
-    out = commonCombine(values)
-    out_values = {}
-    h_name = key[2]
-    for (filter_path, histogram) in out.iteritems():
-        # first, discard any malformed (non int) entries, while allowing floats
-        # in the statistics
-        for i, val in enumerate(histogram):
-            T = type(val)
-            if T is not int:
-                if T is float:
-                    if i is len(histogram) - 3 or i is len(histogram) - 4:
-                        continue # allow elements of stats to be floats
-                msg = ("discarding %s - %s malformed type: %s on index %i" %
-                       ('/'.join(filter_path), h_name, T, i))
-                if verbose:
-                    print >> sys.stderr, msg
-                return
-        out_values["/".join(filter_path)] = histogram
+    # Produce output ready for json serialization
+    output = {}
+    for filterPath, hgramValues in commonCombine(values).iteritems():
+        output["/".join(filterPath)] = hgramValues

-    if h_name.startswith("SIMPLE_MEASURES_"):
+    # Get histogram name
+    hgramName = key[2]
+    if hgramName.startswith("SIMPLE_MEASURES_"):
        buckets = simple_measures_buckets[1];
    else:
-        # histogram_specs lookup below is guaranteed to succeed, because of mapper
-        buckets = histogram_specs.get(h_name)[1]
+        buckets = specs.histograms.get(hgramName)[1]
+
+    # Write final output
    final_out = {
-        'buckets': buckets,
-        'values': out_values
+        'buckets':  buckets,
+        'values':   output
    }
    context.write("/".join(key), json.dumps(final_out))
-
-
-def output(path, results):
-    f = open(path, 'w')
-    for k, v in results:
-        f.write(k + "\t" + v + "\n")
--- a/test.py
+++ b/test.py
@ -1,208 +0,0 @@
-try:
-    import simplejson as json
-except ImportError:
-    import json
-import math
-import sys
-
-# Import histogram specs and generated by makefile using specgen.py
-import specs
-
-# Counts number of times we've printed a log message
-logMsgCount = {}
-
-# Auxiliary method to write log messages
-def log(msg, *args):
-    # We only print a log message the first 10 times we see it
-    n = logMsgCount.get(msg, 10)
-    if n > 0:
-        logMsgCount[msg] = n - 1
-        print >> sys.stderr, msg % args
-
-# Auxiliary method for computing bucket offsets from parameters, it is stolen
-# from histogram_tools.py, though slightly modified...
-def exponential_buckets(dmin, dmax, n_buckets):
-    log_max = math.log(dmax);
-    ret_array = [0] * n_buckets
-    current = dmin
-    ret_array[1] = current
-    for bucket_index in range(2, n_buckets):
-        log_current = math.log(current)
-        log_ratio = (log_max - log_current) / (n_buckets - bucket_index)
-        log_next = log_current + log_ratio
-        next_value = int(math.floor(math.exp(log_next) + 0.5))
-        if next_value > current:
-            current = next_value
-        else:
-            current = current + 1
-        ret_array[bucket_index] = current
-    return ret_array
-
-# Create buckets from buckets2index from ranges... snippet pretty much stolen
-# from specgen.py
-def buckets2index_from_ranges(ranges):
-    buckets = map(str, ranges)
-    bucket2index = {}
-    for i in range(0, len(buckets)):
-        bucket2index[buckets[i]] = i
-    return bucket2index
-
-# Bucket offsets for simple measures
-simple_measures_buckets = (
-                           buckets2index_from_ranges(
-                                            exponential_buckets(1, 30000, 50)),
-                           exponential_buckets(1, 30000, 50)
-                           )
-
-# Cache of all output values, we do a linear scan so we won't write any thing
-# until map_finished() gets called... This is hack that makes linear scans
-# a lot faster :)
-cache = {}
-
-def map_finished(context):
-    log("map_finished outputing: %s keys", len(cache))
-    global cache
-    for key, value in cache.iteritems():
-        context.write(key, value)
-    cache = {}
-
-# Auxiliary function for aggregating a result to the cache, we pass in context
-# so we can skip caching here should we ever want to do this
-def write_to_cache(key, value, context):
-    global cache
-    cachedValue = cache.get(key, None)
-    if cachedValue is None:
-        cache[key] = value
-    else:
-        for filterPath, hgramValues in value.iteritems():
-            existing = cachedValue.get(filterPath, None)
-            if existing is None:
-                cachedValue[filterPath] = hgramValues
-                continue
-            for y in xrange(0, len(hgramValues)):
-                existing[y] += (hgramValues[y] or 0)
-
-# histogram incoming format:
-#   [
-#       bucket0, bucket1, ..., bucketN,
-#       sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi
-#   ]
-# Aggregated histogram format:
-#   [
-#       bucket0, bucket1, ..., bucketN,
-#       sum, log_sum, log_sum_squares, sum_squares_lo, sum_squares_hi, count
-#   ]
-# where count is the number of histograms aggregated in the histogram.
-
-def map(key, dims, value, context):
-    # Unpack dimensions
-    reason, appName, channel, version, buildId, submissionDate = dims
-
-    # Get the major version
-    majorVersion = version.split('.')[0]
-
-    # Get the build date, ignore the rest of the buildId
-    buildDate = buildId[:8]
-
-    # Load JSON payload
-    payload = json.loads(value)
-
-    # Get OS, osVersion and architecture information
-    try:
-        info = payload['info']
-        OS = info['OS']
-        osVersion = str(info['version'])
-        arch = info['arch']
-    except (KeyError, IndexError, UnicodeEncodeError):
-        log("error while unpacking the payload")
-        return
-
-    # todo combine OS + osVersion + santize on crazy platforms like linux to
-    #      reduce pointless choices
-    if OS == "Linux":
-        osVersion = osVersion[:3]
-
-    # Create filter path
-    filterPath = (buildDate, reason, appName, OS, osVersion, arch)
-
-    # For each histogram
-    for hgramName, hgramValues in payload.get('histograms', {}).iteritems():
-        # Check that we have bucket information on this histogram
-        bucket2index = specs.histograms.get(hgramName, None)
-        if bucket2index == None:
-            log("Missing bucket2index for %s", hgramName)
-            continue
-        # Abort if bucket length doesn't match
-        if len(hgramValues) == len(bucket2index[0]) + 5:
-            write_to_cache((channel, majorVersion, hgramName),
-                          {filterPath: hgramValues + [1]}, context)
-    
-    # Now read and output simple measures
-    for name, value in payload.get('simpleMeasurements', {}).iteritems():
-        # Handle cases where the value is a dictionary of simple measures
-        if type(value) == dict:
-            for subName, subValue in value.iteritems():
-                map_simplemeasure(channel, majorVersion, filterPath,
-                                  name + "_" + str(subName), subValue, context)
-        else:
-            map_simplemeasure(channel, majorVersion, filterPath, str(name),
-                              value, context)
-
-# Map a simple measure
-def map_simplemeasure(channel, majorVersion, filterPath, name, value, context):
-    # Sanity check value
-    if type(value) not in (int, long, float):
-        log("%s is not a value type for simpleMeasurements \"%s\"",
-            type(value), name)
-        return
-
-    bucket = simple_measures_buckets[1]
-    outarray = [0] * (len(bucket) + 6)
-    for i in reversed(range(0, len(bucket))):
-        if value >= bucket[i]:
-            outarray[i] = 1
-            break
-
-    log_val = math.log(math.fabs(value) + 1)
-    outarray[-6] = value                # sum
-    outarray[-5] = log_val              # log_sum
-    outarray[-4] = log_val * log_val    # log_sum_squares
-    outarray[-3] = 0                    # sum_squares_lo
-    outarray[-2] = 0                    # sum_squares_hi
-    outarray[-1] = 1                    # count
-
-    # Output result array
-    write_to_cache((channel, majorVersion, "SIMPLE_MEASURES_" + name.upper()), 
-                   {filterPath: outarray}, context)
-
-def commonCombine(values):
-    output = {}
-    for d in values:
-        for filterPath, hgramValues in d.iteritems():
-            existing = output.get(filterPath, None)
-            if existing is None:
-                output[filterPath] = hgramValues
-                continue
-            for y in xrange(0, len(hgramValues)):
-                existing[y] += (hgramValues[y] or 0)
-    return output
-
-def reduce(key, values, context):
-    # Produce output ready for json serialization
-    output = {}
-    for filterPath, hgramValues in commonCombine(values).iteritems():
-        output["/".join(filterPath)] = hgramValues
-
-    # Get histogram name
-    hgramName = key[2]
-    if hgramName.startswith("SIMPLE_MEASURES_"):
-        buckets = simple_measures_buckets[1];
-    else:
-        buckets = specs.histograms.get(hgramName)[1]
-
-    # Write final output
-    final_out = {
-        'buckets':  buckets,
-        'values':   output
-    }
-    context.write("/".join(key), json.dumps(final_out))