Sarah Bird 2019-03-30 02:57:28 -05:00
Parent 01a81c2fe6
Commit 180d0d9aa0
31 changed files with 1349 additions and 2 deletions

21
.editorconfig Normal file

@@ -0,0 +1,21 @@
# http://editorconfig.org
root = true
[*]
indent_style = space
indent_size = 4
trim_trailing_whitespace = true
insert_final_newline = true
charset = utf-8
end_of_line = lf
[*.bat]
indent_style = tab
end_of_line = crlf
[LICENSE]
insert_final_newline = false
[Makefile]
indent_style = tab

15
.github/ISSUE_TEMPLATE.md vendored Normal file

@@ -0,0 +1,15 @@
* Dye Score version:
* Python version:
* Operating System:
### Description
Describe what you were trying to get done.
Tell us what happened, what went wrong, and what you expected to happen.
### What I Did
```
Paste the command(s) you ran and the output.
If there was a crash, please include the traceback here.
```

102
.gitignore vendored Normal file

@@ -0,0 +1,102 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# dotenv
.env
# virtualenv
.venv
venv/
ENV/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/

17
.travis.yml Normal file

@@ -0,0 +1,17 @@
# Config file for automatic testing at travis-ci.org
language: python
python:
  - 3.7
  - 3.6
  - 3.5
  - 3.4

install:
  - pip install -r requirements_dev.txt
  - python setup.py install

script:
  - py.test

128
CONTRIBUTING.rst Normal file

@@ -0,0 +1,128 @@
.. highlight:: shell
============
Contributing
============
Contributions are welcome, and they are greatly appreciated! Every little bit
helps, and credit will always be given.
You can contribute in many ways:
Types of Contributions
----------------------
Report Bugs
~~~~~~~~~~~
Report bugs at https://github.com/birdsarah/dye_score/issues.
If you are reporting a bug, please include:
* Your operating system name and version.
* Any details about your local setup that might be helpful in troubleshooting.
* Detailed steps to reproduce the bug.
Fix Bugs
~~~~~~~~
Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
wanted" is open to whoever wants to implement it.
Implement Features
~~~~~~~~~~~~~~~~~~
Look through the GitHub issues for features. Anything tagged with "enhancement"
and "help wanted" is open to whoever wants to implement it.
Write Documentation
~~~~~~~~~~~~~~~~~~~
Dye Score could always use more documentation, whether as part of the
official Dye Score docs, in docstrings, or even on the web in blog posts,
articles, and such.
Submit Feedback
~~~~~~~~~~~~~~~
The best way to send feedback is to file an issue at https://github.com/birdsarah/dye_score/issues.
If you are proposing a feature:
* Explain in detail how it would work.
* Keep the scope as narrow as possible, to make it easier to implement.
* Remember that this is a volunteer-driven project, and that contributions
are welcome :)
Get Started!
------------
Ready to contribute? Here's how to set up `dye_score` for local development.
1. Fork the `dye_score` repo on GitHub.
2. Clone your fork locally::

    $ git clone git@github.com:your_name_here/dye_score.git

3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::

    $ mkvirtualenv dye_score
    $ cd dye_score/
    $ python setup.py develop

4. Create a branch for local development::

    $ git checkout -b name-of-your-bugfix-or-feature

   Now you can make your changes locally.

5. When you're done making changes, check that your changes pass flake8 and the
   tests, including testing other Python versions with tox::

    $ flake8 dye_score tests
    $ python setup.py test  # or: py.test
    $ tox

   To get flake8 and tox, just pip install them into your virtualenv.

6. Commit your changes and push your branch to GitHub::

    $ git add .
    $ git commit -m "Your detailed description of your changes."
    $ git push origin name-of-your-bugfix-or-feature

7. Submit a pull request through the GitHub website.
Pull Request Guidelines
-----------------------
Before you submit a pull request, check that it meets these guidelines:
1. The pull request should include tests.
2. If the pull request adds functionality, the docs should be updated. Put
your new functionality into a function with a docstring, and add the
feature to the list in README.rst.
3. The pull request should work for Python 3.4, 3.5, 3.6, and 3.7. Check
   https://travis-ci.org/birdsarah/dye_score/pull_requests
   and make sure that the tests pass for all supported Python versions.
Tips
----
To run a subset of tests::

    $ py.test tests/test_dye_score.py
Deploying
---------
A reminder for the maintainers on how to deploy.
Make sure all your changes are committed (including an entry in HISTORY.rst).
Then run::

    $ bumpversion patch # possible: major / minor / patch
    $ git push
    $ git push --tags
Travis will then deploy to PyPI if tests pass.

10
MANIFEST.in Normal file

@@ -0,0 +1,10 @@
include CONTRIBUTING.rst
include HISTORY.rst
include LICENSE
include README.rst
recursive-include tests *
recursive-exclude * __pycache__
recursive-exclude * *.py[co]
recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif

81
Makefile Normal file

@@ -0,0 +1,81 @@
.PHONY: clean clean-test clean-pyc clean-build docs help
.DEFAULT_GOAL := help

define BROWSER_PYSCRIPT
import os, webbrowser, sys
from urllib.request import pathname2url
webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
endef
export BROWSER_PYSCRIPT

define PRINT_HELP_PYSCRIPT
import re, sys
for line in sys.stdin:
    match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
    if match:
        target, help = match.groups()
        print("%-20s %s" % (target, help))
endef
export PRINT_HELP_PYSCRIPT

BROWSER := python -c "$$BROWSER_PYSCRIPT"

help:
	@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)

clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts

clean-build: ## remove build artifacts
	rm -fr build/
	rm -fr dist/
	rm -fr .eggs/
	find . -name '*.egg-info' -exec rm -fr {} +
	find . -name '*.egg' -exec rm -f {} +

clean-pyc: ## remove Python file artifacts
	find . -name '*.pyc' -exec rm -f {} +
	find . -name '*.pyo' -exec rm -f {} +
	find . -name '*~' -exec rm -f {} +
	find . -name '__pycache__' -exec rm -fr {} +

clean-test: ## remove test and coverage artifacts
	rm -f .coverage
	rm -fr htmlcov/
	rm -fr .pytest_cache

lint: ## check style with flake8
	flake8 dye_score tests

test: ## run tests quickly with the default Python
	py.test

coverage: ## check code coverage quickly with the default Python
	coverage run --source dye_score -m pytest
	coverage report -m
	coverage html
	$(BROWSER) htmlcov/index.html

docs: ## generate Sphinx HTML documentation, including API docs
	rm -f docs/dye_score.rst
	rm -f docs/modules.rst
	sphinx-apidoc -o docs/ dye_score
	$(MAKE) -C docs clean
	$(MAKE) -C docs html
	$(BROWSER) docs/_build/html/index.html

servedocs: docs ## compile the docs watching for changes
	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .

release: dist ## package and upload a release
	twine upload dist/*

dist: clean ## builds source and wheel package
	python setup.py sdist
	python setup.py bdist_wheel
	ls -l dist

install: clean ## install the package to the active Python's site-packages
	python setup.py install

2
README.md

@@ -1,2 +0,0 @@
# dye-score
Utilities to build the dye-score metric from OpenWPM javascript call data.

8
README.rst Normal file

@@ -0,0 +1,8 @@
=========
Dye Score
=========
Utilities to build the dye-score metric from OpenWPM_ javascript call data.
.. _OpenWPM: https://github.com/mozilla/openwpm

20
docs/Makefile Normal file

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = python -msphinx
SPHINXPROJ = dye_score
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

159
docs/conf.py Executable file

@@ -0,0 +1,159 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# dye_score documentation build configuration file, created by
# sphinx-quickstart on Fri Jun 9 13:47:02 2017.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another
# directory, add these directories to sys.path here. If the directory is
# relative to the documentation root, use os.path.abspath to make it
# absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('..'))
import sphinx_rtd_theme
import dye_score
# -- General configuration ---------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'Dye Score'
copyright = u"2019, Sarah Bird"
author = u"Sarah Bird"
# The version info for the project you're documenting, acts as replacement
# for |version| and |release|, also used in various other places throughout
# the built documents.
#
# The short X.Y version.
version = dye_score.__version__
# The full version, including alpha/beta/rc tags.
release = dye_score.__version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# These patterns also affect html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
# -- Options for HTML output -------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a
# theme further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# -- Options for HTMLHelp output ---------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'dye_scoredoc'
# -- Options for LaTeX output ------------------------------------------
latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass
# [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'dye_score.tex',
     u'Dye Score Documentation',
     u'Sarah Bird', 'manual'),
]

# -- Options for manual page output ------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'dye_score',
     u'Dye Score Documentation',
     [author], 1)
]

# -- Options for Texinfo output ----------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'dye_score',
     u'Dye Score Documentation',
     author,
     'dye_score',
     'Utilities to build the dye-score metric from OpenWPM javascript call data.',
     'Miscellaneous'),
]

1
docs/contributing.rst Normal file

@@ -0,0 +1 @@
.. include:: ../CONTRIBUTING.rst

7
docs/dye_score.rst Normal file

@@ -0,0 +1,7 @@
API Reference
=============
.. automodule:: dye_score
:members:
:undoc-members:
:show-inheritance:

18
docs/index.rst Normal file

@@ -0,0 +1,18 @@
Welcome to Dye Score's documentation!
======================================
.. toctree::
   :maxdepth: 2
   :caption: Contents:

   readme
   installation
   usage
   modules
   contributing
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

49
docs/installation.rst Normal file

@@ -0,0 +1,49 @@
.. highlight:: shell
============
Installation
============
Prerequisites
-------------
You will need `Apache Spark`_ available on your system. PySpark will be
installed when you install dye_score.
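
Once installed, a quick smoke test that PySpark is importable (assuming a
standard install) is:

.. code-block:: console

    $ python -c "import pyspark; print(pyspark.__version__)"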
Stable release
--------------

To install Dye Score, run this command in your terminal:

.. code-block:: console

    $ pip install dye_score

From sources
------------

The sources for Dye Score can be downloaded from the `Github repo`_.

You can either clone the public repository:

.. code-block:: console

    $ git clone git://github.com/mozilla/dye_score

Or download the `tarball`_:

.. code-block:: console

    $ curl -OL https://github.com/mozilla/dye_score/tarball/master

Once you have a copy of the source, you can install it with:

.. code-block:: console

    $ python setup.py install

.. _Apache Spark: https://spark.apache.org/downloads.html
.. _Github repo: https://github.com/mozilla/dye_score
.. _tarball: https://github.com/mozilla/dye_score/tarball/master

36
docs/make.bat Normal file

@@ -0,0 +1,36 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=python -msphinx
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=dye_score

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
	echo.then set the SPHINXBUILD environment variable to point to the full
	echo.path of the 'sphinx-build' executable. Alternatively you may add the
	echo.Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd

7
docs/modules.rst Normal file

@@ -0,0 +1,7 @@
dye_score
=========
.. toctree::
   :maxdepth: 4

   dye_score

1
docs/readme.rst Normal file

@@ -0,0 +1 @@
.. include:: ../README.rst

7
docs/usage.rst Normal file

@@ -0,0 +1,7 @@
=====
Usage
=====
To use Dye Score in a project::

    import dye_score
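
A fuller end-to-end run might look like the sketch below. It assumes a
``config.yaml`` as described in ``DyeScore``, a running Spark session, and a
choice of snippet ids to dye; the paths and thresholds are illustrative only::

    from pyspark.sql import SparkSession
    from dye_score import DyeScore

    ds = DyeScore('config.yaml')
    ds.validate_input_data()

    spark = SparkSession.builder.getOrCreate()
    ds.build_raw_snippet_df()
    ds.build_snippet_map()
    ds.build_snippets(spark)
    ds.build_snippet_snippet_dyeing_map(spark)

    dye_snippets = [...]  # snippet ids chosen by your dyeing strategy
    ds.compute_distances_for_dye_snippets(dye_snippets)
    ds.compute_snippets_scores_for_thresholds(thresholds=[0.1, 0.2])
    ds.compute_dye_score_for_thresholds(thresholds=[0.1, 0.2])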

9
dye_score/__init__.py Normal file

@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
from .dye_score import DyeScore
__author__ = """Sarah Bird"""
__email__ = 'fx-data-dev@mozilla.org'
__version__ = '0.1.0'
__all__ = ['DyeScore']

8
dye_score/distances.py Normal file

@@ -0,0 +1,8 @@
import numpy as np


def get_chebyshev_distances_xarray_ufunc(df_array, df_dye_array):
    # As invoked via xarray.apply_ufunc in dye_score.py, df_array arrives with
    # shape (n_snippets, 1, n_symbols) and df_dye_array (n_dye_snippets, n_symbols).
    def chebyshev(x):
        # Max absolute difference between one dye snippet and every snippet row.
        return np.abs(df_array[:, 0, :] - x).max(axis=1)
    # One row per dye snippet; transpose to (n_snippets, n_dye_snippets).
    result = np.apply_along_axis(chebyshev, 1, df_dye_array).T
    return result
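
As a sanity check of the shape contract this ufunc relies on (the layout
``xarray.apply_ufunc`` produces in ``dye_score.py``), a small self-contained
sketch with made-up sizes:

    import numpy as np
    from dye_score.distances import get_chebyshev_distances_xarray_ufunc

    # 4 snippets x 3 symbols, with the size-1 broadcast dye axis in the middle;
    # 2 dye snippets x 3 symbols.
    snippets = np.random.rand(4, 1, 3)
    dyes = np.random.rand(2, 3)
    distances = get_chebyshev_distances_xarray_ufunc(snippets, dyes)
    assert distances.shape == (4, 2)  # one distance per (snippet, dye snippet) pair
    assert np.isclose(distances[0, 0], np.abs(snippets[0, 0] - dyes[0]).max())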

407
dye_score/dye_score.py Normal file

@@ -0,0 +1,407 @@
# -*- coding: utf-8 -*-
import os
import numpy as np
import shutil
import yaml

from dask.dataframe import (
    from_pandas,
    read_csv,
    read_parquet,
)
from pprint import pprint
from xarray import (
    apply_ufunc,
    DataArray,
    open_zarr,
)

try:
    from pyspark.sql.functions import udf
except ModuleNotFoundError:
    print('PySpark not available for data processing.')

from .distances import (
    get_chebyshev_distances_xarray_ufunc,
)
from .utils import (
    get_netloc,
    get_path,
    get_end_of_path,
    get_clean_script,
)


def get_raw_snippet_from_row(row):
    script_url = row.script_url
    func_name = row.func_name
    if script_url == '':
        script_url = row.top_level_url
    netloc = get_netloc(script_url)
    path = get_path(script_url)
    path_end = get_end_of_path(path)
    return netloc + '||' + path_end + '||' + func_name
class DyeScore:

    __conf = {
        "INPUT_PARQUET_LOCATION": "",
        "DYESCORE_DATA_DIR": "",
        "DYESCORE_RESULTS_DIR": "",
        "USE_AWS": False,
        "AWS_ACCESS_KEY_ID": "",
        "AWS_SECRET_ACCESS_KEY": "",
    }

    dye_score_columns = [
        'top_level_url',
        'script_url',
        'func_name',
        'symbol',
    ]

    dye_score_files = {
        'raw_snippet_call_df': 'raw_snippet_call_df.parquet',
        'raw_snippet_to_snippet_lookup': 'snippet_lookup.parquet',
        'snippets': 'snippets.zarr',
        'snippet_dyeing_map': 'snippet_dyeing_map.parquet',
    }

    @property
    def s3_storage_options(self):
        if self.config('USE_AWS') is True:
            return dict(
                anon=False,
                key=self.config('AWS_ACCESS_KEY_ID'),
                secret=self.config('AWS_SECRET_ACCESS_KEY')
            )
        else:
            return None

    @property
    def to_parquet_opts(self):
        return dict(
            compression='snappy', engine='pyarrow', storage_options=self.s3_storage_options
        )
    def __init__(self, config_file_path, validate_config=True):
        """Sets up dye score config used to interact with your environment.

        Holds references to file paths and private data such as AWS API keys.
        Expects a YAML file with the following keys:

        * INPUT_PARQUET_LOCATION - the location of the raw or sampled OpenWPM input parquet folder
        * DYESCORE_DATA_DIR - location where you would like dye score to store data assets
        * DYESCORE_RESULTS_DIR - location where you would like dye score to store results assets
        * USE_AWS - default False - set true if data store is AWS
        * AWS_ACCESS_KEY_ID - optional - for storing and retrieving data on AWS
        * AWS_SECRET_ACCESS_KEY - optional - for storing and retrieving data on AWS

        Locations can be a local file path or a bucket.
        if not os.path.exists(config_file_path):
            raise ValueError(f'config_file_path `{config_file_path}` not found')
        with open(config_file_path, 'r') as f:
            config = yaml.safe_load(f.read())
        self.__conf['INPUT_PARQUET_LOCATION'] = config['INPUT_PARQUET_LOCATION']
        self.__conf['DYESCORE_DATA_DIR'] = config['DYESCORE_DATA_DIR']
        # DYESCORE_RESULTS_DIR is documented above and used by the scoring
        # methods, so load it alongside the other locations.
        self.__conf['DYESCORE_RESULTS_DIR'] = config.get('DYESCORE_RESULTS_DIR', '')
        use_aws = config.get('USE_AWS', False)
        self.__conf['USE_AWS'] = bool(use_aws)
        self.__conf['AWS_ACCESS_KEY_ID'] = config.get('AWS_ACCESS_KEY_ID', '')
        self.__conf['AWS_SECRET_ACCESS_KEY'] = config.get('AWS_SECRET_ACCESS_KEY', '')
        pprint(DyeScore.__conf)
        if validate_config is True:
            self.validate_config()

    def config(self, option):
        """Used by dye score methods to retrieve config options"""
        return self.__conf[option]

    def dye_score_data_file(self, filename):
        dyescoredir = self.config('DYESCORE_DATA_DIR')
        path = os.path.join(dyescoredir, self.dye_score_files[filename])
        return path
    def validate_config(self):
        if self.config('USE_AWS') is True:
            assert self.config('INPUT_PARQUET_LOCATION').startswith('s3://')
            assert self.config('DYESCORE_DATA_DIR').startswith('s3://')

    def validate_input_data(self):
        in_file = self.config('INPUT_PARQUET_LOCATION')
        df = read_parquet(in_file, engine='pyarrow')
        for column in self.dye_score_columns:
            assert column in df.columns, f'{column} missing from df.columns ({df.columns})'
            assert df[column].dtype == 'object', f'{column} does not have dtype `object`'
        return True

    def get_input_df(self, columns=None):
        if not columns:
            columns = self.dye_score_columns
        in_file = self.config('INPUT_PARQUET_LOCATION')
        df = read_parquet(in_file, columns=columns, engine='pyarrow')
        return df
    ##
    # DATA PROCESSING
    #
    # In the next few methods we switch between dask and spark. We switch to spark
    # whenever we need to leverage its superior performance handling strings.
    ##

    @staticmethod
    def file_in_validation(inpath):
        if not os.path.exists(inpath):
            raise ValueError(f'File {inpath} does not exist. Cannot proceed.')

    @staticmethod
    def file_out_validation(outpath, override):
        if os.path.exists(outpath) and override is False:
            raise ValueError(f'File {outpath} already exists. Use `override=True` to remove and replace.')
        if os.path.exists(outpath) and override is True:
            print(f'Removing existing file {outpath}')
            shutil.rmtree(outpath)
    def build_raw_snippet_df(self, override=False):
        """Builds raw_snippets from input data

        Snippet function is ``script_url.netloc||script_url.path_end||func_name``.
        If script_url is missing, top_level_url is used.

        Args:
            override (bool): True to replace any existing outputs
        Returns:
            str. The file path where output is saved
        """
        # TODO Add an issue to supply user generated snippet code
        # File setup
        inpath = self.config('INPUT_PARQUET_LOCATION')
        outpath = self.dye_score_data_file('raw_snippet_call_df')
        self.file_in_validation(inpath)
        self.file_out_validation(outpath, override)
        # Process
        df = read_parquet(inpath, columns=self.dye_score_columns, engine='pyarrow')
        df['raw_snippet'] = df.apply(get_raw_snippet_from_row, axis=1, meta='O')
        df['called'] = 1
        print(df.head())
        df.to_parquet(outpath, **self.to_parquet_opts)
        return outpath
    def build_snippet_map(self, override=False):
        """Builds snippet ids and saves map of ids to raw snippets

        xarray cannot handle arbitrary-length string indexes, so we need to build a set of
        unique ids to reference snippets. This method creates the ids and saves the map of
        raw snippets to ids.

        Args:
            override (bool): True to replace any existing outputs
        Returns:
            str. The file path where output is saved
        """
        # TODO File an issue - do we have a problem with duplicate snippets?
        # File setup
        inpath = self.dye_score_data_file('raw_snippet_call_df')
        outpath = self.dye_score_data_file('raw_snippet_to_snippet_lookup')
        self.file_in_validation(inpath)
        self.file_out_validation(outpath, override)
        # Process
        df = read_parquet(inpath, columns=['raw_snippet'], engine='pyarrow')
        snippet_lookup = df.raw_snippet.unique().to_frame()
        # NB: Python's string hash is salted per process - set PYTHONHASHSEED
        # if ids need to be reproducible across runs.
        snippet_lookup['snippet'] = snippet_lookup.raw_snippet.apply(lambda x: hash(x), meta='int64')
        print(snippet_lookup.head())
        snippet_lookup.to_parquet(outpath, **self.to_parquet_opts)
        return outpath
    def _load_and_join_raw_data_to_snippets(self, spark, columns=None, override=False):
        # File setup
        snippet_map = self.dye_score_data_file('raw_snippet_to_snippet_lookup')
        inpath = self.dye_score_data_file('raw_snippet_call_df')
        self.file_in_validation(snippet_map)
        self.file_in_validation(inpath)
        # Process - join the raw snippet data to the snippet id map
        df_map = spark.read.parquet(snippet_map).select(['raw_snippet', 'snippet'])
        df = spark.read.parquet(inpath)
        if columns:
            df = df.select(columns)
        joined = df.join(df_map, on='raw_snippet')
        joined = joined.drop('raw_snippet')
        return joined
    def build_snippets(self, spark, override=False):
        """Builds row-normalized snippet dataset

        Dimensions are n snippets x s unique symbols in dataset.
        Data is output in zarr format with processing by spark, dask, and xarray.
        Creates an intermediate tmp file when converting from spark to dask.

        Args:
            spark (pyspark.sql.session.SparkSession): spark instance
            override (bool): True to replace any existing outputs
        Returns:
            str. The file path where output is saved
        """
        # TODO File an issue to run everything on S3
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        # File setup
        outpath = self.dye_score_data_file('snippets')
        self.file_out_validation(outpath, override)
        # Process - pivot with spark and save to tmp file
        df_to_pivot = self._load_and_join_raw_data_to_snippets(
            spark, columns=['symbol', 'called', 'raw_snippet'], override=override
        )
        symbols = df_to_pivot.select('symbol').distinct().toPandas()
        symbols = sorted(list(symbols.symbol.values))
        print(f'Dataset has {len(symbols)} unique symbols')
        pivot = df_to_pivot.groupBy('snippet').pivot('symbol', symbols).sum('called')
        pivot = pivot.na.fill(0)
        tmp = 'tmp.csv'
        if os.path.exists(tmp):
            shutil.rmtree(tmp)
        pivot.write.csv(tmp, header=True)
        # Process - set_index, normalize and save to zarr
        dtypes = {symbol: 'float64' for symbol in symbols}
        dtypes['snippet'] = 'object'
        pivot_table = read_csv(f'{tmp}/*.csv', dtype=dtypes)
        pivot_table = pivot_table.set_index('snippet')
        row_normalize = pivot_table.div(pivot_table.sum(axis=1), axis=0)
        row_normalize_array = DataArray(
            row_normalize,
            dims=['snippet', 'symbol'],
            coords={
                'snippet': row_normalize.index.values,
                'symbol': row_normalize.columns
            }
        )
        print(row_normalize_array)
        row_normalize_array.to_dataset(name='data').to_zarr(store=outpath)
        # Cleanup
        shutil.rmtree(tmp)
        return outpath
    def build_snippet_snippet_dyeing_map(self, spark, override=False):
        """Build file used to join snippets to data for dyeing.

        Adds clean_script field to dataset. Saves parquet file with:

        * snippet - the int version, not raw_snippet
        * top_level_url
        * script_url
        * clean_script

        Args:
            spark (pyspark.sql.session.SparkSession): spark instance
            override (bool): True to replace any existing outputs
        Returns:
            str. The file path where output is saved
        """
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        # File setup
        outpath = self.dye_score_data_file('snippet_dyeing_map')
        self.file_out_validation(outpath, override)
        # Process
        df = self._load_and_join_raw_data_to_snippets(
            spark, columns=['top_level_url', 'script_url', 'func_name', 'raw_snippet'], override=override
        )
        get_clean_script_udf = udf(get_clean_script)
        df = df.withColumn('clean_script', get_clean_script_udf(df.script_url))
        df = df.dropDuplicates()
        df.write.parquet(outpath, compression='snappy')
        return outpath
    ##
    # Dyeing and Scoring
    ##

    def compute_distances_for_dye_snippets(self, dye_snippets, filename_suffix='dye_snippets', override=False):
        """Computes the Chebyshev distance from every snippet to every dye snippet.

        Args:
            dye_snippets: snippet ids to dye
            filename_suffix (str): suffix for the results file name
            override (bool): True to replace any existing outputs
        Returns:
            str. The file path where output is saved
        """
        # File setup
        snippet_file = self.dye_score_data_file('snippets')
        self.file_in_validation(snippet_file)
        resultsdir = self.config('DYESCORE_RESULTS_DIR')
        file_name = f'snippets_dye_distances_from_{filename_suffix}'
        outpath = os.path.join(resultsdir, file_name)
        self.file_out_validation(outpath, override)
        # Process distances
        df = open_zarr(store=snippet_file)['data']
        df = df.chunk({'symbol': -1})
        df_c = df.chunk({'snippet': 10_000})
        df_dye = df.loc[{'snippet': dye_snippets}]
        df_dye = df_dye.rename({'snippet': 'dye_snippet'})
        df_dye_c = df_dye.chunk({'dye_snippet': 100})
        distance_array = apply_ufunc(
            get_chebyshev_distances_xarray_ufunc,
            df_c, df_dye_c,
            dask='parallelized',
            output_dtypes=[float],
            input_core_dims=[['symbol'], ['symbol']],
        )
        print(distance_array)
        distance_array.to_dataset(name='data').to_zarr(store=outpath)
        return outpath
    def compute_snippets_scores_for_thresholds(self, thresholds, filename_suffix='dye_snippets', override=False):
        """For each threshold, counts how many dye snippets each snippet is within distance of.

        Dye snippets that are within the threshold of more than 20% of snippets are
        considered leaky and excluded.
        """
        resultsdir = self.config('DYESCORE_RESULTS_DIR')
        file_name = f'snippets_dye_distances_from_{filename_suffix}'
        inpath = os.path.join(resultsdir, file_name)
        self.file_in_validation(inpath)
        distance_array = open_zarr(store=inpath)['data']
        # TODO Make issue to not hard code this
        LEAKY_THRESHOLD = 0.2
        n_sites = distance_array.shape[0]
        N_LEAKY_THRESHOLD = LEAKY_THRESHOLD * n_sites
        outpaths = []
        for threshold in thresholds:
            n_to_dye = np.sum(distance_array < threshold, axis=0).persist()
            non_leaky_sites = n_to_dye[n_to_dye < N_LEAKY_THRESHOLD].coords.to_index()
            distance_array_filtered = distance_array.loc[{'dye_snippet': non_leaky_sites}]
            site_counts = np.sum(distance_array_filtered < threshold, axis=1)
            site_counts_df = site_counts.to_dataframe()
            site_counts_df = site_counts_df.reset_index().rename(columns={'data': 'dye_count'})
            site_counts_df['snippet'] = site_counts_df.snippet.astype(int)
            outpath = os.path.join(resultsdir, f'snippets_score_from_{filename_suffix}_{threshold}')
            self.file_out_validation(outpath, override)
            from_pandas(site_counts_df, npartitions=1).to_parquet(outpath, **self.to_parquet_opts)
            outpaths.append(outpath)
        return outpaths
    def compute_dye_score_for_thresholds(self, thresholds, filename_suffix='dye_snippets', override=False):
        """Joins snippet dye counts back to scripts and takes the max count per clean_script."""
        snippet_dyeing_map_file = self.dye_score_data_file('snippet_dyeing_map')
        snippet_data = read_parquet(snippet_dyeing_map_file, engine='pyarrow')
        resultsdir = self.config('DYESCORE_RESULTS_DIR')
        outpaths = []
        for threshold in thresholds:
            inpath = os.path.join(resultsdir, f'snippets_score_from_{filename_suffix}_{threshold}')
            outpath = os.path.join(resultsdir, f'dye_score_from_{filename_suffix}_{threshold}.csv.gz')
            self.file_out_validation(outpath, override)
            site_counts_df = read_parquet(inpath)
            script_to_dye = snippet_data.merge(site_counts_df, on='snippet')
            script_to_dye_max = script_to_dye[['clean_script', 'dye_count']].groupby('clean_script').max()
            script_to_dye_max = script_to_dye_max.rename(columns={'dye_count': 'dye_score'})
            script_to_dye_max.to_csv(outpath, compression='gzip')
            outpaths.append(outpath)
        return outpaths

36
dye_score/utils.py Normal file

@@ -0,0 +1,36 @@
from urllib.parse import urlparse

EMPTY_STRING = 'EMPTY_STRING'


def get_netloc(x):
    p = urlparse(x)
    val = p.netloc
    if len(val) == 0:
        val = EMPTY_STRING
    return val


def get_path(x):
    p = urlparse(x)
    val = p.path
    if len(val) == 0:
        val = EMPTY_STRING
    return val


def get_end_of_path(x):
    splits = x.split('/')
    val = ''
    if len(splits) > 0:
        val = splits[-1]
    else:
        val = x
    if len(val) == 0:
        val = EMPTY_STRING
    return val


def get_clean_script(x):
    p = urlparse(x)
    return f'{p.netloc}{p.path}'

10
requirements_dev.txt Normal file

@@ -0,0 +1,10 @@
bumpversion==0.5.3
coverage==4.5.1
flake8==3.5.0
pytest==3.8.2
pytest-runner==4.2
Sphinx==1.8.1
sphinx_rtd_theme==0.4.3
twine==1.12.1
watchdog==0.9.0
wheel==0.32.1

26
setup.cfg Normal file

@@ -0,0 +1,26 @@
[bumpversion]
current_version = 0.1.0
commit = True
tag = True
[bumpversion:file:setup.py]
search = version='{current_version}'
replace = version='{new_version}'
[bumpversion:file:dye_score/__init__.py]
search = __version__ = '{current_version}'
replace = __version__ = '{new_version}'
[bdist_wheel]
universal = 1
[flake8]
exclude = docs
[aliases]
# Define setup.py command aliases here
test = pytest
[tool:pytest]
collect_ignore = ['setup.py']
addopts = --disable-warnings

50
setup.py Normal file

@@ -0,0 +1,50 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""The setup script."""
from setuptools import setup, find_packages
with open('README.rst') as readme_file:
    readme = readme_file.read()

requirements = [
    'pyyaml==3.13',
    'dask[complete]==1.1.5',
    'pyarrow==0.12.1',
    'pyspark==2.4.0',
    'xarray==0.12.0',
    'zarr==2.2.0',
]

setup_requirements = ['pytest-runner', ]

test_requirements = ['pytest', ]

setup(
    author="Sarah Bird",
    author_email='fx-data-dev@mozilla.org',
    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Science/Research',
        'Natural Language :: English',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
    ],
    description="Utilities to build the dye-score metric from OpenWPM javascript call data.",
    install_requires=requirements,
    long_description=readme,
    include_package_data=True,
    keywords='dye_score',
    name='dye_score',
    packages=find_packages(include=['dye_score']),
    setup_requires=setup_requirements,
    test_suite='tests',
    tests_require=test_requirements,
    url='https://github.com/mozilla/dye-score',
    version='0.1.0',
    zip_safe=False,
)

3
tests/assets/invalid_config.yaml Normal file

@@ -0,0 +1,3 @@
AWS_ACCSES_KEY_ID:
jgdflkgjsld;gs

5
tests/assets/valid_config.yaml Normal file

@@ -0,0 +1,5 @@
INPUT_PARQUET_LOCATION: s3://inputdir
DYESCORE_DATA_DIR: s3://outputdir
USE_AWS: True
AWS_ACCESS_KEY_ID: jgdflkgjsld;gs
AWS_SECRET_ACCESS_KEY: dsil;guewort9q9vkdf/

4
tests/assets/valid_config_aws_false.yaml Normal file

@@ -0,0 +1,4 @@
INPUT_PARQUET_LOCATION: inputdir
DYESCORE_DATA_DIR: outputdir
USE_AWS: False

21
tests/conftest.py Normal file

@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
import os

import pytest


@pytest.fixture
def asset_dir():
    test_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(test_dir, 'assets')


@pytest.fixture
def sample_config():
    return {
        "INPUT_PARQUET_LOCATION": "",
        "DYESCORE_DATA_DIR": "",
        "USE_AWS": False,
        "AWS_ACCESS_KEY_ID": "",
        "AWS_SECRET_ACCESS_KEY": "",
    }

83
tests/test_dye_score.py Normal file

@@ -0,0 +1,83 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import dask.dataframe as dd
import os
import pandas as pd
import pytest
import yaml

from pyarrow.lib import ArrowIOError
from yaml.scanner import ScannerError

from dye_score import DyeScore


##
# Test Data Validation
##

def test_data_validation_with_invalid_file(tmpdir, sample_config):
    # Set-up invalid data file and save config
    data_dir = os.path.join(tmpdir, 'data.csv')
    config_file = os.path.join(tmpdir, 'config.yaml')
    df = pd.DataFrame({'a': [1, 2, 3]})
    df.to_csv(data_dir)
    sample_config['INPUT_PARQUET_LOCATION'] = data_dir
    with open(config_file, 'w') as f:
        f.write(yaml.dump(sample_config))
    ds = DyeScore(config_file)
    # Test
    with pytest.raises(ArrowIOError):
        ds.validate_input_data()


def test_data_validation_with_valid_file(tmpdir, sample_config):
    # Set-up valid data file and save config
    data_dir = os.path.join(tmpdir, 'data.parquet')
    config_file = os.path.join(tmpdir, 'config.yaml')
    df = pd.DataFrame({
        'top_level_url': ['a', 'b'],
        'script_url': ['c', 'd'],
        'symbol': ['e', 'f'],
        'func_name': ['g', 'h']
    })
    dfd = dd.from_pandas(df, npartitions=2)
    dfd.to_parquet(data_dir)
    sample_config['INPUT_PARQUET_LOCATION'] = data_dir
    with open(config_file, 'w') as f:
        f.write(yaml.dump(sample_config))
    ds = DyeScore(config_file)
    # Test
    assert ds.validate_input_data() is True


##
# Test Config
##

def test_config_requires_valid_file():
    with pytest.raises(ValueError):
        DyeScore('test.yaml')


def test_config_requires_valid_yaml(asset_dir):
    with pytest.raises(ScannerError):
        DyeScore(os.path.join(asset_dir, 'invalid_config.yaml'))


def test_config_sets_properties(asset_dir):
    ds = DyeScore(os.path.join(asset_dir, 'valid_config.yaml'))
    assert ds.config('INPUT_PARQUET_LOCATION') == 's3://inputdir'
    assert ds.config('DYESCORE_DATA_DIR') == 's3://outputdir'
    assert ds.config('AWS_ACCESS_KEY_ID') == 'jgdflkgjsld;gs'
    assert ds.config('AWS_SECRET_ACCESS_KEY') == 'dsil;guewort9q9vkdf/'


def test_config_with_use_aws_true(asset_dir):
    ds = DyeScore(os.path.join(asset_dir, 'valid_config.yaml'))
    assert ds.config('USE_AWS') is True


def test_config_with_use_aws_false(asset_dir):
    ds = DyeScore(os.path.join(asset_dir, 'valid_config_aws_false.yaml'))
    assert ds.config('USE_AWS') is False