This commit is contained in:
Dave Zeber 2015-03-06 13:14:30 -06:00
Parent 778e6bc05a
Commit ed1766d749
3 changed files with 458 additions and 0 deletions


@@ -0,0 +1,107 @@
#!/bin/bash
#######################################################################
##
## This script generates lookup tables of partner distribution IDs
## and search plugin names to be used in processing FHR data.
##
## In particular, these tables are useful in identifying partner
## builds from distribution ID strings, and identifying searches
## through a particular search provider.
##
## Processing steps are as follows:
## - Parse out necessary information from online resources and
## summarize it in a few tables stored as CSV files.
## - Copy CSV files to Mozilla web server (app1) so that they
## can be viewed within Mozilla.
## - Generate RData containing the lookup information reformatted
## appropriately for use in the R lookup functions.
## - Copy RData to HDFS location accessible by FHR rollups job.
## - Make RData accessible to other hala users for ad hoc
## FHR processing (via /usr/local/share/).
##
#######################################################################
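## Illustrative run (hypothetical path and script name, inferred from the
## log file below; the script expects partners.py and package-lookups.R in
## the working directory, as the calls further down assume):
##   cd /path/to/lookup && bash generate_lookup_tables.sh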
. ~/.bash_profile
# Base dir for the script output.
OUTPUT_DIR="$HOME/fhr-lookup"
[[ -d "$OUTPUT_DIR" ]] || mkdir "$OUTPUT_DIR"
# Log file.
OUTPUT_LOG="$OUTPUT_DIR/generate_lookup_tables.log"
exec > "$OUTPUT_LOG" 2>&1
# CSV files to contain lookup tables generated by the Python script:
# Full table of relevant info about partner builds
# extracted from the repack configs.
FULL_DISTRIB_INFO="$OUTPUT_DIR/full_partner_distrib_info.csv"
# Table of partner names and corresponding distribution IDs
# used by that partner.
DISTRIB_IDS_TABLE="$OUTPUT_DIR/partner_distrib_ids.csv"
# Table of partner names and default search plugins
# used across that partner's custom distributions.
DISTRIB_SEARCH_TABLE="$OUTPUT_DIR/partner_distrib_search.csv"
# Table of all official search plugins included across all Mozilla builds.
# Lists shortname and full descriptive name for each.
ALL_SEARCH_PLUGINS="$OUTPUT_DIR/official_search_plugins.csv"
## RData file name to contain lookup objects.
LOOKUP_RDATA_BASENAME="partner-search-lookup.RData"
LOOKUP_RDATA="$OUTPUT_DIR/$LOOKUP_RDATA_BASENAME"
# Shared locations:
# Web - location is pulled from env variable in local profile.
SHARED_WEB_LOCATION="$APP1_REF"
# HDFS
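# ${HOME/home/user} is bash pattern substitution: it replaces the first
# "home" in $HOME with "user", e.g. /home/jsmith -> /user/jsmith
# (hypothetical user), matching the conventional HDFS home-directory layout.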
SHARED_HDFS_LOCATION="${HOME/home/user}/shared"
echo "Running lookup table generation: `date`"
## Generate CSV files.
echo "Generating CSV files..."
python partners.py \
    "$FULL_DISTRIB_INFO" \
    "$DISTRIB_IDS_TABLE" \
    "$DISTRIB_SEARCH_TABLE" \
    "$ALL_SEARCH_PLUGINS"
if [[ $? -ne 0 ]]; then
    echo "There was an error generating CSV files. Exiting..."
    exit 1
fi
echo "Done."
## Generate RData.
echo "Generating RData..."
Rscript --vanilla package-lookups.R \
    "$DISTRIB_IDS_TABLE" \
    "$DISTRIB_SEARCH_TABLE" \
    "$ALL_SEARCH_PLUGINS" \
    "$LOOKUP_RDATA"
if [[ $? -ne 0 ]]; then
    echo "There was an error generating RData. Exiting..."
    exit 1
fi
echo "Done."
## Copy to app1.
chmod 644 "$OUTPUT_DIR"/*.csv
[[ -z $SHARED_WEB_LOCATION ]] || \
    (scp "$OUTPUT_DIR"/*.csv "$SHARED_WEB_LOCATION" && \
        echo "Copied CSVs to app1.")
## Copy to HDFS.
chmod 755 "$LOOKUP_RDATA"
hadoop dfs -copyFromLocal "$LOOKUP_RDATA" \
    "$SHARED_HDFS_LOCATION/$LOOKUP_RDATA_BASENAME" && \
echo "Copied RData to HDFS."
echo "Lookup table generation completed: `date`"
echo "Exiting..."
exit 0

lookup/package-lookups.R (new file, 32 lines)

@@ -0,0 +1,32 @@
#######################################################################
###
### Package lookup tables into an RData for use in FHR processing.
### Expects 4 command-line args:
### - CSV of partner names and IDs
### - CSV of partner names and search defaults
### - CSV of all search plugins and names
### - RData file to output to.
###
#######################################################################
library(data.table)
files <- commandArgs(TRUE)
## Current and expired partner IDs.
## Lookup table is stored as named vector.
## Identify expired partner builds with the suffix "|expired".
partner.ids <- as.data.table(read.csv(files[1], stringsAsFactors = FALSE))
partner.ids[type == "expired", partner := sprintf("%s|expired", partner)]
partner.list <- partner.ids[, setNames(partner, distrib_id)]
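## Illustrative lookup (hypothetical ID): partner.list[["example-distrib-id"]]
## yields "examplepartner", or "examplepartner|expired" for an expired build.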
## Partner search plugins.
partner.plugins <- as.data.table(read.csv(files[2], stringsAsFactors = FALSE))
partner.plugins[type == "expired", partner := sprintf("%s|expired", partner)]
partner.plugins <- split(partner.plugins$search_name, partner.plugins$partner)
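## split() returns a named list keyed by partner name, e.g. (hypothetical)
## partner.plugins[["examplepartner"]] is a character vector of plugin names.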
## All official search plugins.
official.plugins <- read.csv(files[3], stringsAsFactors = FALSE)[, "plugin_id"]
save(partner.list, partner.plugins, official.plugins,
file = files[4])
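## Downstream R sessions can restore all three lookup objects with, e.g.,
## load("partner-search-lookup.RData").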

lookup/partners.py (new file, 319 lines)

@@ -0,0 +1,319 @@
"""
Download and parse latest list of partner distributions and search plugins.
Script expects 4 command-line arguments containing 4 output CSV file names:
- table of full partner build info
- table of partner build distribution IDs
- table of partner build search plugins
- table of Firefox official search plugins
provided in that order.
"""
import urllib2
from HTMLParser import HTMLParser
import ConfigParser
import codecs
import json
import sys
from operator import itemgetter
# Base dir containing partner repack config files.
partner_packs_url = 'https://hg.mozilla.org/build/partner-repacks/file/tip/'
# Localization config containing search plugins.
spv_url = 'http://l10n.mozilla-community.org/~flod/p12n/searchplugins.json'
# Script extracts partner info from packing configs.
# Create separate lists for current and expired partner builds.
partner_dirs = {
'current': 'partners',
'expired': 'inactive-configs'
}
# Headers for output CSVs.
# Filenames will be read from command-line args and added for each entry
# with the key 'file'.
output = {
# Table containing partner package information -
# distribution IDs and search default.
'full_package_info': {
'headers': [
'type',
'pack_name',
'partner',
'distrib_id',
'repack_distrib_id',
'search_default'
]
},
# List of partner distrib ID strings by partner
# (set of unique IDs based on both config files)
'distrib_ids': {
'headers': [
'type',
'partner',
'distrib_id'
]
},
# List of partner search defaults.
'distrib_search': {
'headers': [
'type',
'partner',
'search_name'
]
},
# Table of localized partner search strings.
'all_search': {
'headers': [
'plugin_id',
'search_name'
]
}
}
class PartnerListParser(HTMLParser):
""" Parser implementation to extract package subdir URLs
from package dir page.
"""
def get_partner_links(self, dirname):
""" Parse HTML directory listing of partner build packages.
Page is a table of build names linking to build configs dir.
Pass in the subdir name containing partner build dirs of interest.
Returns a dict mapping partner build names to config dir URLs.
"""
self.dirname = '/' + dirname + '/'
# Maintain cache of current href attribute from 'a' tags
# to know when to store data.
self.current_link = None
# Output mapping.
self.partner_links = {}
html = urllib2.urlopen(partner_packs_url + dirname).read()
self.feed(html)
return self.partner_links
def handle_starttag(self, tag, attrs):
""" Look for anchor tags pointing to package subdirs.
Cache URL when such a tag is encountered. """
if tag == 'a':
# Find href URLs containing self.dirname.
            # dict() lookup avoids an IndexError on an <a> without an href.
            href = dict(attrs).get('href', '')
if self.dirname in href:
self.current_link = href
def handle_endtag(self, tag):
""" Clear current URL on tag close. """
if tag == 'a' and self.current_link is not None:
self.current_link = None
def handle_data(self, data):
""" Store package dir name as the non-trivial text content
of the 'a' tag. """
        if self.current_link is not None:
            if data and data != 'files' and data not in self.partner_links:
                self.partner_links[data] = self.current_link
class FakeSecHead(object):
""" Add fake section header for repack.cfg,
so that these files play well with ConfigParser.
Adds section header 'FakeSection' to beginning of file on reading.
Taken from 'http://stackoverflow.com/questions/2819696/
parsing-properties-file-in-python'
"""
def __init__(self, fp):
self.fp = fp
self.first_line = True
def readline(self):
if self.first_line:
self.first_line = False
return u'[FakeSection]\n'
return self.fp.readline()
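# Illustrative effect (hypothetical file contents): a repack.cfg starting
# with 'dist_id=examplepartner' is presented to RawConfigParser as
# '[FakeSection]\ndist_id=examplepartner'.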
class UnicodeUrl(object):
""" Ensure files downloaded from remote URLs are read as UTF-8.
Necessary for correctly reading search plugin names.
"""
def __init__(self, url):
self.fp = urllib2.urlopen(url)
def readline(self):
return codecs.decode(self.fp.readline(), 'utf8', 'replace')
# Fields to look for in config files: (<section>, <option>)
config_options = {
'distrib_id': ('Global', 'id'),
'partner': ('Preferences', 'mozilla.partner.id'),
'repack_distrib_id': ('FakeSection', 'dist_id')
}
search_option = ('LocalizablePreferences', 'browser.search.defaultenginename')
# Encode and write list to a file as CSV row.
def write_row(fp, row):
fp.write(','.join(row).encode('utf8') + '\n')
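# e.g. write_row(f, [u'type', u'partner']) writes the UTF-8 line "type,partner".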
def generate_partner_tables():
""" Download and parse necessary information from partner build configs.
Organize information into tables and print to output files.
"""
html_parser = PartnerListParser()
partner_info = []
for partner_type in partner_dirs:
# For each group of partners,
# get list of partner IDs and default search plugins.
config_urls = html_parser.get_partner_links(partner_dirs[partner_type])
for name in sorted(config_urls.keys()):
link = config_urls[name].replace('/file/', '/raw-file/')
link = 'https://hg.mozilla.org' + link
vals = { 'type': partner_type, 'pack_name': name }
# Parse config files.
configs = ConfigParser.RawConfigParser()
configs.readfp(UnicodeUrl(link + '/distribution/distribution.ini'))
configs.readfp(FakeSecHead(UnicodeUrl(link + '/repack.cfg')))
# Read partner distribution ID strings.
            for co in config_options:
                try:
                    vals[co] = configs.get(*config_options[co]).strip('"')
                except ConfigParser.Error:
                    # Option absent from this config; leave it unset.
                    pass
# Fill in missing mozilla partner IDs using package name.
# Common partner ID is part before underscore.
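            # e.g. a pack named "examplepartner_de" (hypothetical) yields
            # partner "examplepartner".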
if 'partner' not in vals:
vals['partner'] = name.split('_')[0]
# Special case for MSN - partner ID is different
# for each distribution (localized). Set it explicitly.
if name.startswith('msn'):
vals['partner'] = 'msn'
# Search default could be in any LocalizablePreferences section -
# add entry for each.
searchdefs = []
locprefs = [section for section in configs.sections()
if section.startswith(search_option[0])]
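            # e.g. "LocalizablePreferences" as well as locale-specific
            # sections like "LocalizablePreferences-de" (hypothetical) match.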
            for locprefs_sec in locprefs:
                try:
                    searchdefs.append(configs.get(locprefs_sec,
                                                  search_option[1]))
                except ConfigParser.Error:
                    # No search default in this section; skip it.
                    pass
if len(searchdefs) == 0:
# No default search plugins. Add previously collected vals.
partner_info.append(vals)
else:
# Add row entry copying previous vals for each search default.
for sd in searchdefs:
vals['search_default'] = sd
partner_info.append(dict(vals))
# Separate out distrib IDs and search plugins by partner name.
distrib_ids = {}
search_names = {}
for entry in partner_info:
key = (entry['type'], entry['partner'])
if key not in distrib_ids:
distrib_ids[key] = set()
# Add distribution IDs from both sources.
distrib_ids[key].add(entry['distrib_id'])
distrib_ids[key].add(entry['repack_distrib_id'])
if 'search_default' in entry:
if key not in search_names:
search_names[key] = set()
search_names[key].add(entry['search_default'])
# Output to CSV files.
# Full table.
with open(output['full_package_info']['file'], 'w') as f:
write_row(f, output['full_package_info']['headers'])
for row in partner_info:
row = [row.get(key, '')
for key in output['full_package_info']['headers']]
write_row(f, row)
# Distribution IDs by partner.
with open(output['distrib_ids']['file'], 'w') as f:
write_row(f, output['distrib_ids']['headers'])
for p in sorted(distrib_ids.keys()):
            # sorted() also avoids shadowing the builtin "id".
            for distrib_id in sorted(distrib_ids[p]):
                write_row(f, [p[0], p[1], distrib_id])
# Search plugins by partner.
with open(output['distrib_search']['file'], 'w') as f:
write_row(f, output['distrib_search']['headers'])
for p in sorted(search_names.keys()):
            for search_name in sorted(search_names[p]):
                write_row(f, [p[0], p[1], search_name])
def generate_plugin_list():
""" Download and parse necessary information on search plugins from
localization JSON. Organize information into a table and
print to output file.
"""
spv_json = json.load(urllib2.urlopen(spv_url))
plugins = set()
# Extract search plugin IDs and full names.
# Search plugins are nested down multiple levels.
for loc in spv_json['locales']:
for app in spv_json['locales'][loc]:
for channel in spv_json['locales'][loc][app]:
spv_list = spv_json['locales'][loc][app][channel]
spv_list = spv_list['searchplugins']
for plugin_id in spv_list:
plugins.add((plugin_id,
'"%s"' % spv_list[plugin_id]['name']))
    plugins = sorted(plugins, key=itemgetter(0, 1))
with open(output['all_search']['file'], 'w') as f:
write_row(f, output['all_search']['headers'])
for p in plugins:
write_row(f, p)
def main(args):
""" Read file names from command-line args and generate tables.
"""
args = zip([
'full_package_info',
'distrib_ids',
'distrib_search',
'all_search'
], args)
for a in args:
output[a[0]]['file'] = a[1]
generate_partner_tables()
generate_plugin_list()
if __name__ == '__main__':
# Check that all output files are specified.
if len(sys.argv) < 5:
sys.exit('Usage requires 4 command-line args' +
' that are output file names')
main(sys.argv[1:])