Added cron job to fetch chromium histograms

2015-02-10 18:31:50 +01:00 · 2015-02-10 18:31:50 +01:00 · 03dd210818
--- a/README.txt
+++ b/README.txt
@ -37,18 +37,18 @@
 *
 * Handler File
 *
-* visiting /metrics/admin calls YesterdayHandler which retrieves yesterday's data from the UMA Cloud Storage.
+* visiting /cron/metrics calls YesterdayHandler which retrieves yesterday's data from the UMA Cloud Storage.
 * This is how the cron job is updating - Daily grabbing the previous day's data
 * The data is parsed and stored as:
-* class StableInstance(webapp2.RequestHandler):
-*     propertyName = db.StringProperty();
-*     bucketID = db.IntegerProperty();
-*     date = db.DateTimeProperty();
-*     hits = db.IntegerProperty();
-*     totalFlushes = db.IntegerProperty();
-*     dayPercentage = db.FloatProperty();
-*     rollingPercentage = db.FloatProperty();
+* class StableInstance(DictModel):
+*     property_name = db.StringProperty();
+*     bucket_id = db.IntegerProperty();
+*     date = db.DateProperty();
+*     day_percentage = db.FloatProperty();
+*     rolling_percentage = db.FloatProperty();
 *
+* visiting /cron/histograms calls HistogramsHandler which retrieves FeatureObserver and
+* MappedCSSProperties histograms from chromium.googlesource.com.
 * 
 * ACTION REQUIRED: we will need to replace histogramID with the appropriate ID.
 * This can be obtained from uma.googleplex.com/data/histograms/ids-chrome-histograms.txt
@ -58,20 +58,13 @@
 * 


-** uma.py
-*
-* property_name provides a mapping for bucketID to human readable property name
-*
-**
-
-
 ** featurelevel.js
 *
 * Creates charts for the feature level page.
 * 
 * drawVisualization()
 * This function takes in the name of the property for which the graph is being drawn.
-* (This should probably be changed to the proeprtyID/bucketID in the future.) 
+* (This should probably be changed to the propertyID/bucketID in the future.)
 * We iterate through parsed data, building up a data object which we can pass to chart.draw()
 * The desired form of data to pass to chart.draw() is:
 *  [[Date,    Name,      Percentage]
--- a/admin.py
+++ b/admin.py
@ -26,6 +26,7 @@ import os
 import re
 import sys
 import webapp2
+import xml.dom.minidom

 # Appengine imports.
 from google.appengine.api import files
@ -40,17 +41,21 @@ from google.appengine.ext.webapp import blobstore_handlers
 import common
 import models
 import settings
-import uma


 # uma.googleplex.com/data/histograms/ids-chrome-histograms.txt
 BIGSTORE_BUCKET = '/gs/uma-dashboards/'
 BIGSTORE_RESTFUL_URI = 'https://uma-dashboards.storage.googleapis.com/'

+HISTOGRAMS_URL = 'https://chromium.googlesource.com/chromium/src/+/master/' \
+    'tools/metrics/histograms/histograms.xml?format=TEXT'
+
 CSSPROPERITES_BS_HISTOGRAM_ID = str(0xbfd59b316a6c31f1)
 ANIMATIONPROPS_BS_HISTOGRAM_ID = str(0xbee14b73f4fdde73)
 FEATURE_OBSERVER_BS_HISTOGRAM_ID = str(0x2e44945129413683)

+PAGE_VISITS_BUCKET_ID = 52
+
 # For fetching files from the production BigStore during development.
 OAUTH2_CREDENTIALS_FILENAME = os.path.join(
    settings.ROOT_DIR, 'scripts', 'oauth2.data')
@ -99,29 +104,28 @@ class YesterdayHandler(blobstore_handlers.BlobstoreDownloadHandler):

    # For CSSPROPERITES_BS_HISTOGRAM_ID, bucket 1 is total pages visited for
    # stank rank histogram. We're guaranteed to have it.
-    # For the FEATURE_OBSERVER_BS_HISTOGRAM_ID, the PageVisits bucket_id is 52
-    # See uma.py. The actual % is calculated from the count / this number.
+    # For the FEATURE_OBSERVER_BS_HISTOGRAM_ID, the PageVisits bucket_id is 52.
+    # The actual % is calculated from the count / this number.
    # For ANIMATIONPROPS_BS_HISTOGRAM_ID, we have to calculate the total count.
    if 1 in properties_dict and histogram_id == CSSPROPERITES_BS_HISTOGRAM_ID:
      total_pages = properties_dict.get(1)
-    elif (uma.PAGE_VISITS_BUCKET_ID in properties_dict and
+    elif (PAGE_VISITS_BUCKET_ID in properties_dict and
          histogram_id == FEATURE_OBSERVER_BS_HISTOGRAM_ID):
-      total_pages = properties_dict.get(uma.PAGE_VISITS_BUCKET_ID)
+      total_pages = properties_dict.get(PAGE_VISITS_BUCKET_ID)

      # Don't include PageVisits results.
-      del properties_dict[uma.PAGE_VISITS_BUCKET_ID]
+      del properties_dict[PAGE_VISITS_BUCKET_ID]
    else:
      total_pages = sum(properties_dict.values())

+    property_map = models.CssPropertyHistogram.get_all()
+    if histogram_id == FEATURE_OBSERVER_BS_HISTOGRAM_ID:
+      property_map = models.FeatureObserverHistogram.get_all()
+
    for bucket_id, num_hits in properties_dict.items():
      # If the id is not in the map, use 'ERROR' for the name.
      # TODO(ericbidelman): Non-matched bucket ids are likely new properties
-      # that have been added and need to be updated in uma.py. Find way to
-      # autofix these values with the appropriate property_name later.
-      property_map = uma.CSS_PROPERTY_BUCKETS
-      if histogram_id == FEATURE_OBSERVER_BS_HISTOGRAM_ID:
-        property_map = uma.FEATUREOBSERVER_BUCKETS
-
+      # that have been added and will be picked up by the next /cron/histograms run.
      property_name = property_map.get(bucket_id, 'ERROR')

      query = model_class.all()
@ -216,6 +220,67 @@ class YesterdayHandler(blobstore_handlers.BlobstoreDownloadHandler):
    return (result.content, result.status_code)


+class HistogramsHandler(webapp2.RequestHandler):
+
+  MODEL_CLASS = {
+    'FeatureObserver': models.FeatureObserverHistogram,
+    'MappedCSSProperties': models.CssPropertyHistogram,
+  }
+
+  def _SaveData(self, data, histogram_id):
+    try:
+      model_class = self.MODEL_CLASS[histogram_id]
+    except Exception:
+      logging.error('Invalid Histogram id used: %s' % histogram_id)
+      return
+
+    bucket_id = int(data['bucket_id'])
+    property_name = data['property_name']
+    key_name = '%s_%s' % (bucket_id, property_name)
+
+    # Bucket ID 1 is reserved for the number of CSS pages visited, so don't add it.
+    if (model_class == models.CssPropertyHistogram and bucket_id == 1):
+      return
+
+    model_class.get_or_insert(key_name,
+      bucket_id=bucket_id,
+      property_name=property_name
+    )
+
+  def get(self):
+    # Attempt to fetch the histograms.xml file from chromium.googlesource.com.
+    result = urlfetch.fetch(HISTOGRAMS_URL)
+
+    if (result.status_code != 200):
+      logging.error('Unable to retrieve chromium histograms.')
+      return
+
+    browsed_histograms = []
+    histograms_content = result.content.decode('base64')
+    dom = xml.dom.minidom.parseString(histograms_content)
+
+    # The histograms.xml file looks like this:
+    #
+    # ...
+    # <enum name="FeatureObserver" type="int">
+    #   <int value="0" label="PageDestruction"/>
+    #   <int value="1" label="LegacyNotifications"/>
+
+    for enum in dom.getElementsByTagName('enum'):
+      histogram_id = enum.attributes['name'].value
+      if (histogram_id in self.MODEL_CLASS.keys()):
+        browsed_histograms.append(histogram_id)
+        for child in enum.getElementsByTagName('int'):
+          data = {
+            'bucket_id': child.attributes['value'].value,
+            'property_name': child.attributes['label'].value
+          }
+          self._SaveData(data, histogram_id)
+
+    # Log an error if some histograms were not found.
+    if (len(list(set(browsed_histograms))) != len(self.MODEL_CLASS.keys())):
+      logging.error('Less histograms than expected were retrieved.')
+
 class FeatureHandler(common.ContentHandler):

  DEFAULT_URL = '/features'
@ -269,8 +334,6 @@ class FeatureHandler(common.ContentHandler):
    elif feature_id and 'new' in path:
      return self.redirect(self.ADD_NEW_URL)

-    feature = None
-
    template_data = {
        'feature_form': models.FeatureForm()
        }
@ -425,6 +488,7 @@ class FeatureHandler(common.ContentHandler):

 app = webapp2.WSGIApplication([
  ('/cron/metrics', YesterdayHandler),
+  ('/cron/histograms', HistogramsHandler),
  ('/(.*)/([0-9]*)', FeatureHandler),
  ('/(.*)', FeatureHandler),
 ], debug=settings.DEBUG)
--- a/app.yaml
+++ b/app.yaml
@ -46,7 +46,7 @@ handlers:
  script: google.appengine.ext.admin.application
  login: admin

- url: /cron/metrics
+- url: /cron/.*
  script: admin.app
  login: admin # Prevents raw access to this handler. Cron runs as admin.

--- a/cron.yaml
+++ b/cron.yaml
@ -1,4 +1,7 @@
 cron:
+- description: retrieve from chromium.googlesource.com chromium histograms
+  url: /cron/histograms
+  schedule: every day 04:00
 - description: retrieve from UMA Cloud Storage data gathered yesterday
  url: /cron/metrics
  schedule: every day 05:00
--- a/models.py
+++ b/models.py
@ -553,3 +553,24 @@ class AppUser(DictModel):
    d = self.to_dict()
    d['id'] = self.key().id()
    return d
+
+
+class HistogramModel(db.Model):
+  """Container for a histogram."""
+
+  bucket_id = db.IntegerProperty(required=True)
+  property_name = db.StringProperty(required=True)
+
+  @classmethod
+  def get_all(self):
+    output = {}
+    buckets = self.all().fetch(None)
+    for bucket in buckets:
+      output[bucket.bucket_id] = bucket.property_name
+    return output
+
+class CssPropertyHistogram(HistogramModel):
+  pass
+
+class FeatureObserverHistogram(HistogramModel):
+  pass
--- a/scripts/fix_data.py
+++ b/scripts/fix_data.py
@ -4,12 +4,12 @@
 # Copyright 2014 Google Inc. All Rights Reserved.

 import models
-import uma


 def CorrectPropertyName(bucket_id):
-  if bucket_id in uma.CSS_PROPERTY_BUCKETS:
-    return uma.CSS_PROPERTY_BUCKETS[bucket_id]
+  allCssPropertyHistograms = models.CssPropertyHistogram.get_all()
+  if bucket_id in allCssPropertyHistograms:
+    return allCssPropertyHistograms[bucket_id]
  return None

 def FetchAllPropertiesWithError(bucket_id=None):
--- a/server.py
+++ b/server.py
@ -27,7 +27,6 @@ from google.appengine.api import users
 import common
 import models
 import settings
-import uma


 def normalized_name(val):
@ -174,11 +173,11 @@ class MainHandler(common.ContentHandler, common.JSONHandler):

      template_data['feature'] = feature
    elif path.startswith('metrics/css/timeline'):
-      properties = sorted(uma.CSS_PROPERTY_BUCKETS.items(), key=lambda x:x[1])
+      properties = sorted(models.CssPropertyHistogram.get_all().iteritems(), key=lambda x:x[1])
      template_data['CSS_PROPERTY_BUCKETS'] = json.dumps(
          properties, separators=(',',':'))
    elif path.startswith('metrics/feature/timeline'):
-      properties = sorted(uma.FEATUREOBSERVER_BUCKETS.items(), key=lambda x:x[1])
+      properties = sorted(models.FeatureObserverHistogram.get_all().iteritems(), key=lambda x:x[1])
      template_data['FEATUREOBSERVER_BUCKETS'] = json.dumps(
          properties, separators=(',',':'))

--- a/uma.py
+++ b/uma.py