Mirror of https://github.com/mozilla/dye-score.git

Bulk add library

This commit is contained in:
Parent: 01a81c2fe6
Commit: 180d0d9aa0
@@ -0,0 +1,21 @@
# http://editorconfig.org

root = true

[*]
indent_style = space
indent_size = 4
trim_trailing_whitespace = true
insert_final_newline = true
charset = utf-8
end_of_line = lf

[*.bat]
indent_style = tab
end_of_line = crlf

[LICENSE]
insert_final_newline = false

[Makefile]
indent_style = tab
@@ -0,0 +1,15 @@
* Dye Score version:
* Python version:
* Operating System:

### Description

Describe what you were trying to get done.
Tell us what happened, what went wrong, and what you expected to happen.

### What I Did

```
Paste the command(s) you ran and the output.
If there was a crash, please include the traceback here.
```
@@ -0,0 +1,102 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
@@ -0,0 +1,17 @@
# Config file for automatic testing at travis-ci.org

language: python
python:
  - 3.7
  - 3.6
  - 3.5
  - 3.4

install:
  - pip install -r requirements-dev.txt
  - python setup.py install

script:
  - py.test
@@ -0,0 +1,128 @@
.. highlight:: shell

============
Contributing
============

Contributions are welcome, and they are greatly appreciated! Every little bit
helps, and credit will always be given.

You can contribute in many ways:

Types of Contributions
----------------------

Report Bugs
~~~~~~~~~~~

Report bugs at https://github.com/birdsarah/dye_score/issues.

If you are reporting a bug, please include:

* Your operating system name and version.
* Any details about your local setup that might be helpful in troubleshooting.
* Detailed steps to reproduce the bug.

Fix Bugs
~~~~~~~~

Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
wanted" is open to whoever wants to implement it.

Implement Features
~~~~~~~~~~~~~~~~~~

Look through the GitHub issues for features. Anything tagged with "enhancement"
and "help wanted" is open to whoever wants to implement it.

Write Documentation
~~~~~~~~~~~~~~~~~~~

Dye Score could always use more documentation, whether as part of the
official Dye Score docs, in docstrings, or even on the web in blog posts,
articles, and such.

Submit Feedback
~~~~~~~~~~~~~~~

The best way to send feedback is to file an issue at https://github.com/birdsarah/dye_score/issues.

If you are proposing a feature:

* Explain in detail how it would work.
* Keep the scope as narrow as possible, to make it easier to implement.
* Remember that this is a volunteer-driven project, and that contributions
  are welcome :)

Get Started!
------------

Ready to contribute? Here's how to set up `dye_score` for local development.

1. Fork the `dye_score` repo on GitHub.
2. Clone your fork locally::

    $ git clone git@github.com:your_name_here/dye_score.git

3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::

    $ mkvirtualenv dye_score
    $ cd dye_score/
    $ python setup.py develop

4. Create a branch for local development::

    $ git checkout -b name-of-your-bugfix-or-feature

   Now you can make your changes locally.

5. When you're done making changes, check that your changes pass flake8 and the
   tests, including testing other Python versions with tox::

    $ flake8 dye_score tests
    $ python setup.py test or py.test
    $ tox

   To get flake8 and tox, just pip install them into your virtualenv.

6. Commit your changes and push your branch to GitHub::

    $ git add .
    $ git commit -m "Your detailed description of your changes."
    $ git push origin name-of-your-bugfix-or-feature

7. Submit a pull request through the GitHub website.

Pull Request Guidelines
-----------------------

Before you submit a pull request, check that it meets these guidelines:

1. The pull request should include tests.
2. If the pull request adds functionality, the docs should be updated. Put
   your new functionality into a function with a docstring, and add the
   feature to the list in README.rst.
3. The pull request should work for Python 3.4, 3.5, 3.6, and 3.7. Check
   https://travis-ci.org/birdsarah/dye_score/pull_requests
   and make sure that the tests pass for all supported Python versions.

Tips
----

To run a subset of tests::

    $ py.test tests.test_dye_score


Deploying
---------

A reminder for the maintainers on how to deploy.
Make sure all your changes are committed (including an entry in HISTORY.rst).
Then run::

    $ bumpversion patch # possible: major / minor / patch
    $ git push
    $ git push --tags

Travis will then deploy to PyPI if tests pass.
@@ -0,0 +1,10 @@
include CONTRIBUTING.rst
include HISTORY.rst
include LICENSE
include README.rst

recursive-include tests *
recursive-exclude * __pycache__
recursive-exclude * *.py[co]

recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
@@ -0,0 +1,81 @@
.PHONY: clean clean-test clean-pyc clean-build docs help
.DEFAULT_GOAL := help

define BROWSER_PYSCRIPT
import os, webbrowser, sys

from urllib.request import pathname2url

webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
endef
export BROWSER_PYSCRIPT

define PRINT_HELP_PYSCRIPT
import re, sys

for line in sys.stdin:
	match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
	if match:
		target, help = match.groups()
		print("%-20s %s" % (target, help))
endef
export PRINT_HELP_PYSCRIPT

BROWSER := python -c "$$BROWSER_PYSCRIPT"

help:
	@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)

clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts

clean-build: ## remove build artifacts
	rm -fr build/
	rm -fr dist/
	rm -fr .eggs/
	find . -name '*.egg-info' -exec rm -fr {} +
	find . -name '*.egg' -exec rm -f {} +

clean-pyc: ## remove Python file artifacts
	find . -name '*.pyc' -exec rm -f {} +
	find . -name '*.pyo' -exec rm -f {} +
	find . -name '*~' -exec rm -f {} +
	find . -name '__pycache__' -exec rm -fr {} +

clean-test: ## remove test and coverage artifacts
	rm -f .coverage
	rm -fr htmlcov/
	rm -fr .pytest_cache

lint: ## check style with flake8
	flake8 dye_score tests

test: ## run tests quickly with the default Python
	py.test

coverage: ## check code coverage quickly with the default Python
	coverage run --source dye_score -m pytest
	coverage report -m
	coverage html
	$(BROWSER) htmlcov/index.html

docs: ## generate Sphinx HTML documentation, including API docs
	rm -f docs/dye_score.rst
	rm -f docs/modules.rst
	sphinx-apidoc -o docs/ dye_score
	$(MAKE) -C docs clean
	$(MAKE) -C docs html
	$(BROWSER) docs/_build/html/index.html

servedocs: docs ## compile the docs watching for changes
	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .

release: dist ## package and upload a release
	twine upload dist/*

dist: clean ## builds source and wheel package
	python setup.py sdist
	python setup.py bdist_wheel
	ls -l dist

install: clean ## install the package to the active Python's site-packages
	python setup.py install
@@ -1,2 +0,0 @@
# dye-score
Utilities to build the dye-score metric from OpenWPM javascript call data.
@@ -0,0 +1,8 @@
=========
Dye Score
=========

Utilities to build the dye-score metric from OpenWPM_ javascript call data.


.. _OpenWPM: https://github.com/mozilla/openwpm
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = python -msphinx
SPHINXPROJ    = dye_score
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -0,0 +1,159 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# dye_score documentation build configuration file, created by
# sphinx-quickstart on Fri Jun 9 13:47:02 2017.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another
# directory, add these directories to sys.path here. If the directory is
# relative to the documentation root, use os.path.abspath to make it
# absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

import sphinx_rtd_theme
import dye_score

# -- General configuration ---------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = u'Dye Score'
copyright = u"2019, Sarah Bird"
author = u"Sarah Bird"

# The version info for the project you're documenting, acts as replacement
# for |version| and |release|, also used in various other places throughout
# the built documents.
#
# The short X.Y version.
version = dye_score.__version__
# The full version, including alpha/beta/rc tags.
release = dye_score.__version__

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output -------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Theme options are theme-specific and customize the look and feel of a
# theme further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}


# -- Options for HTMLHelp output ---------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'dye_scoredoc'


# -- Options for LaTeX output ------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass
# [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'dye_score.tex',
     u'Dye Score Documentation',
     u'Sarah Bird', 'manual'),
]


# -- Options for manual page output ------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'dye_score',
     u'Dye Score Documentation',
     [author], 1)
]


# -- Options for Texinfo output ----------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'dye_score',
     u'Dye Score Documentation',
     author,
     'dye_score',
     'One line description of project.',
     'Miscellaneous'),
]
@@ -0,0 +1 @@
.. include:: ../CONTRIBUTING.rst
@@ -0,0 +1,7 @@
API Reference
=============

.. automodule:: dye_score
   :members:
   :undoc-members:
   :show-inheritance:
@@ -0,0 +1,18 @@
Welcome to Dye Score's documentation!
======================================

.. toctree::
   :maxdepth: 2
   :caption: Contents:

   readme
   installation
   usage
   modules
   contributing

Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
@@ -0,0 +1,49 @@
.. highlight:: shell

============
Installation
============

Prerequisites
-------------

You will need `Apache Spark`_ available on your system. PySpark will be
installed when you install dye_score.


Stable release
--------------

To install Dye Score, run this command in your terminal:

.. code-block:: console

    $ pip install dye_score

From sources
------------

The sources for Dye Score can be downloaded from the `Github repo`_.

You can either clone the public repository:

.. code-block:: console

    $ git clone git://github.com/mozilla/dye_score

Or download the `tarball`_:

.. code-block:: console

    $ curl -OL https://github.com/mozilla/dye_score/tarball/master

Once you have a copy of the source, you can install it with:

.. code-block:: console

    $ python setup.py install


.. _Apache spark: https://spark.apache.org/downloads.html
.. _Github repo: https://github.com/mozilla/dye_score
.. _tarball: https://github.com/mozilla/dye_score/tarball/master
@@ -0,0 +1,36 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=python -msphinx
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=dye_score

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
	echo.then set the SPHINXBUILD environment variable to point to the full
	echo.path of the 'sphinx-build' executable. Alternatively you may add the
	echo.Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd
@@ -0,0 +1,7 @@
dye_score
=========

.. toctree::
   :maxdepth: 4

   dye_score
@@ -0,0 +1 @@
.. include:: ../README.rst
@@ -0,0 +1,7 @@
=====
Usage
=====

To use Dye Score in a project::

    import dye_score
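
Beyond the bare import, the entry point this commit adds is the ``DyeScore`` class, configured from a YAML file. A minimal, hypothetical setup sketch (the config file name and its values here are illustrative, not part of the commit)::

    from dye_score import DyeScore

    # Hypothetical config file with the keys documented on DyeScore.__init__:
    #   INPUT_PARQUET_LOCATION: /data/openwpm_js_calls.parquet
    #   DYESCORE_DATA_DIR: /data/dye_score_data
    #   DYESCORE_RESULTS_DIR: /data/dye_score_results
    #   USE_AWS: False
    ds = DyeScore('dye_score_config.yaml')

    # Check the input parquet has the four required string columns:
    # top_level_url, script_url, func_name, symbol.
    ds.validate_input_data()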
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-

from .dye_score import DyeScore

__author__ = """Sarah Bird"""
__email__ = 'fx-data-dev@mozilla.org'
__version__ = '0.1.0'

__all__ = ['DyeScore']
@@ -0,0 +1,8 @@
import numpy as np


def get_chebyshev_distances_xarray_ufunc(df_array, df_dye_array):
    # Pairwise Chebyshev (L-infinity) distances between every snippet row
    # in df_array and every dye-snippet row in df_dye_array, measured over
    # the symbol axis.
    def chebyshev(x):
        # Distance from a single dye-snippet vector x to every snippet row.
        return np.abs(df_array[:, 0, :] - x).max(axis=1)
    # Apply per dye-snippet row, then transpose to (snippet, dye_snippet).
    result = np.apply_along_axis(chebyshev, 1, df_dye_array).T
    return result
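
To make the shapes concrete, a small hand-checked sketch, assuming the module is importable as ``dye_score.distances``: in the ``apply_ufunc`` call in dye_score.py below, ``df_array`` arrives with a broadcast singleton middle axis, shape ``(n_snippets, 1, n_symbols)``, while ``df_dye_array`` is ``(n_dye_snippets, n_symbols)``; entry ``[i, j]`` of the result is the Chebyshev distance between snippet ``i`` and dye snippet ``j``::

    import numpy as np

    from dye_score.distances import get_chebyshev_distances_xarray_ufunc

    # Two snippets and two dye snippets over three symbols (made-up numbers).
    snippets = np.array([[0.5, 0.5, 0.0],
                         [0.0, 0.2, 0.8]])[:, np.newaxis, :]  # (2, 1, 3)
    dye = np.array([[0.5, 0.5, 0.0],
                    [1.0, 0.0, 0.0]])                         # (2, 3)

    d = get_chebyshev_distances_xarray_ufunc(snippets, dye)
    # d[i, j] == max_k |snippets[i, k] - dye[j, k]|
    print(d)
    # [[0.  0.5]
    #  [0.8 1. ]]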
@@ -0,0 +1,407 @@
# -*- coding: utf-8 -*-
import os
import numpy as np
import shutil
import yaml

from dask.dataframe import (
    from_pandas,
    read_csv,
    read_parquet,
)
from pprint import pprint
from xarray import (
    apply_ufunc,
    DataArray,
    open_zarr,
)
try:
    from pyspark.sql.functions import udf
except ModuleNotFoundError:
    print('PySpark not available for data processing.')

from .distances import (
    get_chebyshev_distances_xarray_ufunc,
)
from .utils import (
    get_netloc,
    get_path,
    get_end_of_path,
    get_clean_script,
)


def get_raw_snippet_from_row(row):
    script_url = row.script_url
    func_name = row.func_name
    if script_url == '':
        script_url = row.top_level_url
    netloc = get_netloc(script_url)
    path = get_path(script_url)
    path_end = get_end_of_path(path)
    return netloc + '||' + path_end + '||' + func_name


class DyeScore:
    __conf = {
        "INPUT_PARQUET_LOCATION": "",
        "DYESCORE_DATA_DIR": "",
        "DYESCORE_RESULTS_DIR": "",
        "USE_AWS": False,
        "AWS_ACCESS_KEY_ID": "",
        "AWS_SECRET_ACCESS_KEY": "",
    }

    dye_score_columns = [
        'top_level_url',
        'script_url',
        'func_name',
        'symbol',
    ]

    dye_score_files = {
        'raw_snippet_call_df': 'raw_snippet_call_df.parquet',
        'raw_snippet_to_snippet_lookup': 'snippet_lookup.parquet',
        'snippets': 'snippets.zarr',
        'snippet_dyeing_map': 'snippet_dyeing_map.parquet',
    }

    @property
    def s3_storage_options(self):
        if self.config('USE_AWS') is True:
            return dict(
                anon=False,
                key=self.config('AWS_ACCESS_KEY_ID'),
                secret=self.config('AWS_SECRET_ACCESS_KEY')
            )
        else:
            return None

    @property
    def to_parquet_opts(self):
        return dict(
            compression='snappy', engine='pyarrow', storage_options=self.s3_storage_options
        )

    def __init__(self, config_file_path, validate_config=True):
        """Sets up dye score config used to interact with your environment.

        Holds references to file paths and private data such as AWS API keys.

        Expects a YAML file with the following keys:
        * INPUT_PARQUET_LOCATION - the location of the raw or sampled OpenWPM input parquet folder
        * DYESCORE_DATA_DIR - location where you would like dye score to store data assets
        * DYESCORE_RESULTS_DIR - location where you would like dye score to store results assets
        * USE_AWS - default False - set true if data store is AWS
        * AWS_ACCESS_KEY_ID - optional - for storing and retrieving data on AWS
        * AWS_SECRET_ACCESS_KEY - optional - for storing and retrieving data on AWS

        Locations can be a local file path or a bucket.
        """
        if not os.path.exists(config_file_path):
            raise ValueError(f'config_file_path `{config_file_path}` not found')

        with open(config_file_path, 'r') as f:
            config = yaml.safe_load(f.read())

        self.__conf['INPUT_PARQUET_LOCATION'] = config['INPUT_PARQUET_LOCATION']
        self.__conf['DYESCORE_DATA_DIR'] = config['DYESCORE_DATA_DIR']
        # The results methods read DYESCORE_RESULTS_DIR, so load it as well
        self.__conf['DYESCORE_RESULTS_DIR'] = config.get('DYESCORE_RESULTS_DIR', '')
        use_aws = config.get('USE_AWS', False)
        self.__conf['USE_AWS'] = bool(use_aws)
        self.__conf['AWS_ACCESS_KEY_ID'] = config.get('AWS_ACCESS_KEY_ID', '')
        self.__conf['AWS_SECRET_ACCESS_KEY'] = config.get('AWS_SECRET_ACCESS_KEY', '')
        pprint(DyeScore.__conf)
        if validate_config is True:
            self.validate_config()

    def config(self, option):
        """Used by dye score methods to retrieve config options"""
        return self.__conf[option]

    def dye_score_data_file(self, filename):
        dyescoredir = self.config('DYESCORE_DATA_DIR')
        path = os.path.join(dyescoredir, self.dye_score_files[filename])
        return path

    def validate_config(self):
        if self.config('USE_AWS') is True:
            assert self.config('INPUT_PARQUET_LOCATION').startswith('s3://')
            assert self.config('DYESCORE_DATA_DIR').startswith('s3://')

    def validate_input_data(self):
        in_file = self.config('INPUT_PARQUET_LOCATION')
        df = read_parquet(in_file, engine='pyarrow')
        for column in self.dye_score_columns:
            assert column in df.columns, f'{column} missing from df.columns ({df.columns})'
            assert df[column].dtype == 'object', f'{column} does not have dtype `object`'
        return True

    def get_input_df(self, columns=None):
        if not columns:
            columns = self.dye_score_columns
        in_file = self.config('INPUT_PARQUET_LOCATION')
        df = read_parquet(in_file, columns=columns, engine='pyarrow')
        return df

    ##
    # DATA PROCESSING
    #
    # In the next few methods we switch between dask and spark. We switch to spark
    # whenever we need to leverage its superior performance handling strings.
    ##

    @staticmethod
    def file_in_validation(inpath):
        if not os.path.exists(inpath):
            raise ValueError(f'File {inpath} does not exist. Cannot proceed.')

    @staticmethod
    def file_out_validation(outpath, override):
        if os.path.exists(outpath) and override is False:
            raise ValueError(f'File {outpath} already exists. Use `override=True` to remove and replace.')
        if os.path.exists(outpath) and override is True:
            print(f'Removing existing file {outpath}')
            shutil.rmtree(outpath)

    def build_raw_snippet_df(self, override=False):
        """Builds raw_snippets from input data

        Snippet function is ``script_url.netloc||script_url.path_end||func_name``.
        If script_url is missing, top_level_url is used instead.

        Args:
            override (bool): True to replace any existing outputs

        Returns:
            str. The file path where output is saved
        """
        # TODO Add an issue to supply user-generated snippet code

        # File setup
        inpath = self.config('INPUT_PARQUET_LOCATION')
        outpath = self.dye_score_data_file('raw_snippet_call_df')
        self.file_in_validation(inpath)
        self.file_out_validation(outpath, override)
        # Process
        df = read_parquet(inpath, columns=self.dye_score_columns, engine='pyarrow')
        df['raw_snippet'] = df.apply(get_raw_snippet_from_row, axis=1, meta='O')
        df['called'] = 1
        print(df.head())
        df.to_parquet(outpath, **self.to_parquet_opts)
        return outpath

    def build_snippet_map(self, override=False):
        """Builds snippet ids and saves map of ids to raw snippets

        xarray cannot handle arbitrary length string indexes so we need to build a set of unique
        ids to reference snippets. This method creates the ids and saves the map of ids to raw snippets.

        Args:
            override (bool): True to replace any existing outputs

        Returns:
            str. The file path where output is saved
        """
        # TODO File an issue - do we have a problem with duplicate snippets?

        # File setup
        inpath = self.dye_score_data_file('raw_snippet_call_df')
        outpath = self.dye_score_data_file('raw_snippet_to_snippet_lookup')
        self.file_in_validation(inpath)
        self.file_out_validation(outpath, override)
        # Process
        df = read_parquet(inpath, columns=['raw_snippet'], engine='pyarrow')
        snippet_lookup = df.raw_snippet.unique().to_frame()
        snippet_lookup['snippet'] = snippet_lookup.raw_snippet.apply(lambda x: hash(x), meta='int64')
        print(snippet_lookup.head())
        snippet_lookup.to_parquet(outpath, **self.to_parquet_opts)
        return outpath

    def _load_and_join_raw_data_to_snippets(self, spark, columns=[], override=False):
        # File setup
        snippet_map = self.dye_score_data_file('raw_snippet_to_snippet_lookup')
        inpath = self.dye_score_data_file('raw_snippet_call_df')
        self.file_in_validation(snippet_map)
        self.file_in_validation(inpath)

        # Process - join raw data to snippet ids with spark
        df_map = spark.read.parquet(snippet_map).select(['raw_snippet', 'snippet'])
        df = spark.read.parquet(inpath)
        if columns:
            df = df.select(columns)
        joined = df.join(df_map, on='raw_snippet')
        joined = joined.drop('raw_snippet')
        return joined

    def build_snippets(self, spark, override=False):
        """Builds row-normalized snippet dataset

        Dimensions are n snippets x s unique symbols in dataset.

        Data is output in zarr format with processing by spark, dask, and xarray.

        Creates an intermediate tmp file when converting from spark to dask.

        Args:
            spark (pyspark.sql.session.SparkSession): spark instance
            override (bool): True to replace any existing outputs

        Returns:
            str. The file path where output is saved
        """
        # TODO File an issue to run everything on S3

        spark.conf.set("spark.sql.execution.arrow.enabled", "true")

        # File setup
        outpath = self.dye_score_data_file('snippets')
        self.file_out_validation(outpath, override)

        # Process - pivot with spark and save to tmp file
        df_to_pivot = self._load_and_join_raw_data_to_snippets(
            spark, columns=['symbol', 'called', 'raw_snippet'], override=override
        )
        symbols = df_to_pivot.select('symbol').distinct().toPandas()
        symbols = sorted(list(symbols.symbol.values))
        print(f'Dataset has {len(symbols)} unique symbols')
        pivot = df_to_pivot.groupBy('snippet').pivot('symbol', symbols).sum('called')
        pivot = pivot.na.fill(0)

        tmp = 'tmp.csv'
        if os.path.exists(tmp):
            shutil.rmtree(tmp)
        pivot.write.csv(tmp, header=True)

        # Process - set_index, normalize and save to zarr
        dtypes = {symbol: 'float64' for symbol in symbols}
        dtypes['snippet'] = 'object'
        pivot_table = read_csv(f'{tmp}/*.csv', dtype=dtypes)
        pivot_table = pivot_table.set_index('snippet')

        row_normalize = pivot_table.div(pivot_table.sum(axis=1), axis=0)
        row_normalize_array = DataArray(
            row_normalize,
            dims=['snippet', 'symbol'],
            coords={
                'snippet': row_normalize.index.values,
                'symbol': row_normalize.columns
            }
        )
        print(row_normalize_array)
        row_normalize_array.to_dataset(name='data').to_zarr(store=outpath)
        # Cleanup
        shutil.rmtree(tmp)
        return outpath

    def build_snippet_snippet_dyeing_map(self, spark, override=False):
        """Build file used to join snippets to data for dyeing.

        Adds clean_script field to dataset. Saves parquet file with:
        * snippet - the int version, not raw_snippet
        * top_level_url
        * script_url
        * clean_script

        Args:
            spark (pyspark.sql.session.SparkSession): spark instance
            override (bool): True to replace any existing outputs

        Returns:
            str. The file path where output is saved

        """
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")

        # File setup
        outpath = self.dye_score_data_file('snippet_dyeing_map')
        self.file_out_validation(outpath, override)

        # Process
        df = self._load_and_join_raw_data_to_snippets(
            spark, columns=['top_level_url', 'script_url', 'func_name', 'raw_snippet'], override=override
        )
        get_clean_script_udf = udf(get_clean_script)
        df = df.withColumn('clean_script', get_clean_script_udf(df.script_url))
        df = df.dropDuplicates()
        df.write.parquet(outpath, compression='snappy')
        return outpath

    ##
    # Dyeing and Scoring
    ##

    def compute_distances_for_dye_snippets(self, dye_snippets, filename_suffix='dye_snippets', override=False):
        # File setup
        snippet_file = self.dye_score_data_file('snippets')
        self.file_in_validation(snippet_file)
        resultsdir = self.config('DYESCORE_RESULTS_DIR')
        file_name = f'snippets_dye_distances_from_{filename_suffix}'
        outpath = os.path.join(resultsdir, file_name)
        self.file_out_validation(outpath, override)

        # Process distances
        df = open_zarr(store=snippet_file)['data']
        df = df.chunk({'symbol': -1})
        df_c = df.chunk({'snippet': 10_000})

        df_dye = df.loc[{'snippet': dye_snippets}]
        df_dye = df_dye.rename({'snippet': 'dye_snippet'})
        df_dye_c = df_dye.chunk({'dye_snippet': 100})

        distance_array = apply_ufunc(
            get_chebyshev_distances_xarray_ufunc,
            df_c, df_dye_c,
            dask='parallelized',
            output_dtypes=[float],
            input_core_dims=[['symbol'], ['symbol']],
        )
        print(distance_array)
        distance_array.to_dataset(name='data').to_zarr(store=outpath)
        return outpath

    def compute_snippets_scores_for_thresholds(self, thresholds, filename_suffix='dye_snippets', override=False):
        resultsdir = self.config('DYESCORE_RESULTS_DIR')
        file_name = f'snippets_dye_distances_from_{filename_suffix}'
        inpath = os.path.join(resultsdir, file_name)
        self.file_in_validation(inpath)
        distance_array = open_zarr(store=inpath)['data']

        # TODO Make issue to not hard code this
        LEAKY_THRESHOLD = 0.2
        n_sites = distance_array.shape[0]
        N_LEAKY_THRESHOLD = LEAKY_THRESHOLD * n_sites

        outpaths = []
        for threshold in thresholds:
            n_to_dye = np.sum(distance_array < threshold, axis=0).persist()
            non_leaky_sites = n_to_dye[n_to_dye < N_LEAKY_THRESHOLD].coords.to_index()
            distance_array_filtered = distance_array.loc[{'dye_snippet': non_leaky_sites}]

            site_counts = np.sum(distance_array_filtered < threshold, axis=1)
            site_counts_df = site_counts.to_dataframe()
            site_counts_df = site_counts_df.reset_index().rename(columns={'data': 'dye_count'})
            site_counts_df['snippet'] = site_counts_df.snippet.astype(int)
            outpath = os.path.join(resultsdir, f'snippets_score_from_{filename_suffix}_{threshold}')
            self.file_out_validation(outpath, override)
            from_pandas(site_counts_df, npartitions=1).to_parquet(outpath, **self.to_parquet_opts)
            outpaths.append(outpath)
        return outpaths

    def compute_dye_score_for_thresholds(self, thresholds, filename_suffix='dye_snippets', override=False):
        snippet_dyeing_map_file = self.dye_score_data_file('snippet_dyeing_map')
        snippet_data = read_parquet(snippet_dyeing_map_file, engine='pyarrow')
        resultsdir = self.config('DYESCORE_RESULTS_DIR')

        outpaths = []
        for threshold in thresholds:
            inpath = os.path.join(resultsdir, f'snippets_score_from_{filename_suffix}_{threshold}')
            outpath = os.path.join(resultsdir, f'dye_score_from_{filename_suffix}_{threshold}.csv.gz')
            self.file_out_validation(outpath, override)

            site_counts_df = read_parquet(inpath)
            script_to_dye = snippet_data.merge(site_counts_df, on='snippet')
            script_to_dye_max = script_to_dye[['clean_script', 'dye_count']].groupby('clean_script').max()
            script_to_dye_max = script_to_dye_max.rename(columns={'dye_count': 'dye_score'})
            script_to_dye_max.to_csv(outpath, compression='gzip')
            outpaths.append(outpath)
        return outpaths
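
Taken together, the build_* and compute_* methods above form a pipeline from raw OpenWPM parquet to per-script dye scores. A rough end-to-end sketch, assuming an existing SparkSession and config file; the dye-snippet selection is a placeholder, since choosing which snippet ids to dye is left to the caller::

    from pyspark.sql import SparkSession

    from dye_score import DyeScore

    spark = SparkSession.builder.getOrCreate()
    ds = DyeScore('dye_score_config.yaml')  # hypothetical config path

    # Data processing: snippet strings, ids, and the row-normalized zarr array
    ds.build_raw_snippet_df(override=True)
    ds.build_snippet_map(override=True)
    ds.build_snippets(spark, override=True)
    ds.build_snippet_snippet_dyeing_map(spark, override=True)

    # Dyeing and scoring: distances from the dyed snippets, then threshold sweeps
    dye_snippets = [...]  # placeholder: snippet ids chosen by some heuristic
    ds.compute_distances_for_dye_snippets(dye_snippets)
    ds.compute_snippets_scores_for_thresholds([0.25, 0.5], override=True)
    ds.compute_dye_score_for_thresholds([0.25, 0.5], override=True)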
@@ -0,0 +1,36 @@
from urllib.parse import urlparse

EMPTY_STRING = 'EMPTY_STRING'


def get_netloc(x):
    p = urlparse(x)
    val = p.netloc
    if len(val) == 0:
        val = EMPTY_STRING
    return val


def get_path(x):
    p = urlparse(x)
    val = p.path
    if len(val) == 0:
        val = EMPTY_STRING
    return val


def get_end_of_path(x):
    splits = x.split('/')
    val = ''
    if len(splits) > 0:
        val = splits[-1]
    else:
        val = x
    if len(val) == 0:
        val = EMPTY_STRING
    return val


def get_clean_script(x):
    p = urlparse(x)
    return f'{p.netloc}{p.path}'
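
For a concrete sense of the pieces these helpers produce (the URL below is made up), these are the parts that ``get_raw_snippet_from_row`` in dye_score.py joins with ``||``::

    from dye_score.utils import (
        get_netloc, get_path, get_end_of_path, get_clean_script,
    )

    url = 'https://cdn.example.com/static/js/tracker.js?v=3'
    netloc = get_netloc(url)            # 'cdn.example.com'
    path = get_path(url)                # '/static/js/tracker.js'
    path_end = get_end_of_path(path)    # 'tracker.js'
    clean = get_clean_script(url)       # 'cdn.example.com/static/js/tracker.js'

    # Snippet form built by get_raw_snippet_from_row ('someFunction' is a
    # hypothetical func_name):
    print(netloc + '||' + path_end + '||' + 'someFunction')
    # cdn.example.com||tracker.js||someFunction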
@@ -0,0 +1,10 @@
bumpversion==0.5.3
coverage==4.5.1
flake8==3.5.0
pytest==3.8.2
pytest-runner==4.2
Sphinx==1.8.1
sphinx_rtd_theme==0.4.3
twine==1.12.1
watchdog==0.9.0
wheel==0.32.1
@@ -0,0 +1,26 @@
[bumpversion]
current_version = 0.1.0
commit = True
tag = True

[bumpversion:file:setup.py]
search = version='{current_version}'
replace = version='{new_version}'

[bumpversion:file:dye_score/__init__.py]
search = __version__ = '{current_version}'
replace = __version__ = '{new_version}'

[bdist_wheel]
universal = 1

[flake8]
exclude = docs

[aliases]
# Define setup.py command aliases here
test = pytest

[tool:pytest]
collect_ignore = ['setup.py']
addopts = --disable-warnings
@@ -0,0 +1,50 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""The setup script."""

from setuptools import setup, find_packages

with open('README.rst') as readme_file:
    readme = readme_file.read()

requirements = [
    'pyyaml==3.13',
    'dask[complete]==1.1.5',
    'pyarrow==0.12.1',
    'pyspark==2.4.0',
    'xarray==0.12.0',
    'zarr==2.2.0',
]

setup_requirements = ['pytest-runner', ]

test_requirements = ['pytest', ]

setup(
    author="Sarah Bird",
    author_email='fx-data-dev@mozilla.org',
    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Science/Research',
        'Natural Language :: English',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
    ],
    description="Utilities to build the dye-score metric from OpenWPM javascript call data.",
    install_requires=requirements,
    long_description=readme,
    include_package_data=True,
    keywords='dye_score',
    name='dye_score',
    packages=find_packages(include=['dye_score']),
    setup_requires=setup_requirements,
    test_suite='tests',
    tests_require=test_requirements,
    url='https://github.com/mozilla/dye-score',
    version='0.1.0',
    zip_safe=False,
)
@@ -0,0 +1,3 @@
AWS_ACCSES_KEY_ID:

jgdflkgjsld;gs
@@ -0,0 +1,5 @@
INPUT_PARQUET_LOCATION: s3://inputdir
DYESCORE_DATA_DIR: s3://outputdir
USE_AWS: True
AWS_ACCESS_KEY_ID: jgdflkgjsld;gs
AWS_SECRET_ACCESS_KEY: dsil;guewort9q9vkdf/
@@ -0,0 +1,4 @@
INPUT_PARQUET_LOCATION: inputdir
DYESCORE_DATA_DIR: outputdir
USE_AWS: False

@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-

import os
import pytest


@pytest.fixture
def asset_dir():
    test_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(test_dir, 'assets')


@pytest.fixture
def sample_config():
    return {
        "INPUT_PARQUET_LOCATION": "",
        "DYESCORE_DATA_DIR": "",
        "USE_AWS": False,
        "AWS_ACCESS_KEY_ID": "",
        "AWS_SECRET_ACCESS_KEY": "",
    }
@@ -0,0 +1,83 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import dask.dataframe as dd
import os
import pandas as pd
import pytest
import yaml

from pyarrow.lib import ArrowIOError
from yaml.scanner import ScannerError

from dye_score import DyeScore


##
# Test Data Validation
##

def test_data_validation_with_invalid_file(tmpdir, sample_config):
    # Set-up invalid data file and save config
    data_dir = os.path.join(tmpdir, 'data.csv')
    config_file = os.path.join(tmpdir, 'config.yaml')
    df = pd.DataFrame({'a': [1, 2, 3]})
    df.to_csv(data_dir)
    sample_config['INPUT_PARQUET_LOCATION'] = data_dir
    with open(config_file, 'w') as f:
        f.write(yaml.dump(sample_config))
    ds = DyeScore(config_file)
    # Test
    with pytest.raises(ArrowIOError):
        ds.validate_input_data()


def test_data_validation_with_valid_file(tmpdir, sample_config):
    # Set-up valid data file and save config
    data_dir = os.path.join(tmpdir, 'data.parquet')
    config_file = os.path.join(tmpdir, 'config.yaml')
    df = pd.DataFrame({
        'top_level_url': ['a', 'b'],
        'script_url': ['c', 'd'],
        'symbol': ['e', 'f'],
        'func_name': ['g', 'h']
    })
    dfd = dd.from_pandas(df, npartitions=2)
    dfd.to_parquet(data_dir)
    sample_config['INPUT_PARQUET_LOCATION'] = data_dir
    with open(config_file, 'w') as f:
        f.write(yaml.dump(sample_config))
    ds = DyeScore(config_file)
    # Test
    assert ds.validate_input_data() is True


##
# Test Config
##

def test_config_requires_valid_file():
    with pytest.raises(ValueError):
        DyeScore('test.yaml')


def test_config_requires_valid_yaml(asset_dir):
    with pytest.raises(ScannerError):
        DyeScore(os.path.join(asset_dir, 'invalid_config.yaml'))


def test_config_sets_properties(asset_dir):
    ds = DyeScore(os.path.join(asset_dir, 'valid_config.yaml'))
    assert ds.config('INPUT_PARQUET_LOCATION') == 's3://inputdir'
    assert ds.config('DYESCORE_DATA_DIR') == 's3://outputdir'
    assert ds.config('AWS_ACCESS_KEY_ID') == 'jgdflkgjsld;gs'
    assert ds.config('AWS_SECRET_ACCESS_KEY') == 'dsil;guewort9q9vkdf/'


def test_config_with_use_aws_true(asset_dir):
    ds = DyeScore(os.path.join(asset_dir, 'valid_config.yaml'))
    assert ds.config('USE_AWS') is True


def test_config_with_use_aws_false(asset_dir):
    ds = DyeScore(os.path.join(asset_dir, 'valid_config_aws_false.yaml'))
    assert ds.config('USE_AWS') is False