Initial commit
This commit is contained in:
Commit e792e1caea
|
@@ -0,0 +1,7 @@
|
|||
*.csv
|
||||
*.pkl
|
||||
*.hdf5
|
||||
resources/
|
||||
!resources/README.md
|
||||
!tests/data/
|
||||
|
|
@@ -0,0 +1,76 @@
|
|||
name: Smoke Test
|
||||
on: push
|
||||
|
||||
# split into separate jobs so they run in parallel, even if a little redundant
|
||||
jobs:
|
||||
docker_build:
|
||||
name: Build Test Container
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
|
||||
- name: Copy Repo Files
|
||||
uses: actions/checkout@master
|
||||
|
||||
- name: docker build
|
||||
run: |
|
||||
echo ${INPUT_PASSWORD} | docker login -u ${INPUT_USERNAME} --password-stdin
|
||||
cd $GITHUB_WORKSPACE
|
||||
docker pull github/csnet-smoketest
|
||||
docker build --cache-from github/csnet-smoketest -t github/csnet-smoketest -f docker/docker-cpu.Dockerfile .
|
||||
docker push github/csnet-smoketest
|
||||
env:
|
||||
INPUT_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
|
||||
INPUT_USERNAME: ${{ secrets.DOCKER_USERNAME }}
|
||||
|
||||
basic_tests:
|
||||
needs: docker_build
|
||||
name: Integration Test Default Parameters
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: mypy type checking
|
||||
run: |
|
||||
cd $GITHUB_WORKSPACE
|
||||
docker run github/csnet-smoketest mypy --ignore-missing-imports --follow-imports skip /src/train.py /src/model_test.py
|
||||
|
||||
- name: neuralbow, all languages
|
||||
run: |
|
||||
cd $GITHUB_WORKSPACE
|
||||
docker run github/csnet-smoketest python train.py /src /tests/data/data_train.txt /tests/data/data_train.txt /tests/data/data_train.txt --dryrun --max-num-epochs 1 --model neuralbow
|
||||
|
||||
- name: --max-files-per-dir 2
|
||||
run: |
|
||||
cd $GITHUB_WORKSPACE
|
||||
docker run github/csnet-smoketest python train.py /src /tests/data/data_train.txt /tests/data/data_train.txt /tests/data/data_train.txt --dryrun --max-num-epochs 1 --max-files-per-dir 2
|
||||
|
||||
CNN:
|
||||
needs: docker_build
|
||||
name: 1DCNN
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: 1dcnn, all languages
|
||||
run: |
|
||||
cd $GITHUB_WORKSPACE
|
||||
docker run github/csnet-smoketest python train.py /src /tests/data/data_train.txt /tests/data/data_train.txt /tests/data/data_train.txt --dryrun --max-num-epochs 1 --model 1dcnn
|
||||
|
||||
selfattn:
|
||||
needs: docker_build
|
||||
name: selfattn
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
|
||||
- name: selfattn, all languages
|
||||
run: |
|
||||
cd $GITHUB_WORKSPACE
|
||||
docker run github/csnet-smoketest python train.py /src /tests/data/data_train.txt /tests/data/data_train.txt /tests/data/data_train.txt --dryrun --max-num-epochs 1 --model selfatt --hypers-override "{\"batch_size\":64}"
|
||||
|
||||
rnn:
|
||||
needs: docker_build
|
||||
name: rnn
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: rnn, all languages
|
||||
run: |
|
||||
cd $GITHUB_WORKSPACE
|
||||
docker run github/csnet-smoketest python train.py /src /tests/data/data_train.txt /tests/data/data_train.txt /tests/data/data_train.txt --dryrun --max-num-epochs 1 --model rnn
|
|
@@ -0,0 +1,27 @@
|
|||
# ts
|
||||
**/node_modules/
|
||||
/webroot/scripts/*.js
|
||||
|
||||
# vim
|
||||
**/*.swp
|
||||
|
||||
# python
|
||||
**/*.pyc
|
||||
**/__pycache__/
|
||||
|
||||
# jupyter
|
||||
**/.ipynb_checkpoints/
|
||||
|
||||
# data
|
||||
resources/
|
||||
!resources/README.md
|
||||
!tests/data/
|
||||
*.csv
|
||||
|
||||
# environment
|
||||
*.ftpconfig
|
||||
|
||||
.idea
|
||||
/src/wandb/run-*
|
||||
/src/wandb/debug.log
|
||||
*.html
|
|
@@ -0,0 +1,80 @@
|
|||
## Submitting runs to the benchmark
|
||||
|
||||
The Weights & Biases (W&B) benchmark tracks and compares models trained on the CodeSearchNet dataset by the global machine learning research community. Anyone is welcome to submit their results for review.
|
||||
|
||||
## Submission process
|
||||
|
||||
### Requirements
|
||||
|
||||
There are a few requirements for submitting a model to the benchmark.
|
||||
- You must have a run logged to [W&B](https://app.wandb.ai).
|
||||
- Your run must have attached inference results in a file named `model_predictions.csv`. You can view all the files attached to a given run in the browser by clicking the "Files" icon from that run's main page.
|
||||
- The schema outlined in the submission format section below must be strictly followed.
|
||||
|
||||
### Submission format
|
||||
|
||||
A valid submission to the CodeSearchNet Challenge requires a file named **model_predictions.csv** with the following fields: `query`, `language`, `identifier`, and `url`:
|
||||
|
||||
* `query`: the textual representation of the query, e.g. "int to string".
|
||||
* `language`: the programming language for the given query, e.g. "python". This information is available as a field in the data to be scored.
|
||||
* `identifier`: an optional field to help you track your data.
|
||||
* `url`: the unique GitHub URL to the returned results, e.g. "https://github.com/JamesClonk/vultr/blob/fed59ad207c9bda0a5dfe4d18de53ccbb3d80c91/cmd/commands.go#L12-L190". This information is available as a field in the data to be scored.
|
||||
|
||||
For further background and instructions on the submission process, see the root README.
|
||||
|
||||
The row order corresponds to the result ranking in the search task. For example, if in row 5 there is an entry for the Python query "read properties file", and in row 60 another result for the Python query "read properties file", then the URL in row 5 is considered to be ranked higher than the URL in row 60 for that query and language.
|
||||
|
||||
The script we used to create the baseline submission is [src/predict.py](src/predict.py). You are not required to use this script to produce your submission file -- we only provide it for reference.
|
||||
|
||||
Here is an example:
|
||||
|
||||
| query | language | identifier | url |
|
||||
| --------------------- | -------- | --------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| convert int to string | python | int_to_decimal_str | https://github.com/raphaelm/python-sepaxml/blob/187b699b1673c862002b2bae7e1bd62fe8623aec/sepaxml/utils.py#L64-L76 |
|
||||
| convert int to string | python | str_to_int_array | https://github.com/UCSBarchlab/PyRTL/blob/0988e5c9c10ededd5e1f58d5306603f9edf4b3e2/pyrtl/rtllib/libutils.py#L23-L33 |
|
||||
| convert int to string | python | Bcp47LanguageParser.IntStr26ToInt | https://github.com/google/transitfeed/blob/eb2991a3747ba541b2cb66502b305b6304a1f85f/extensions/googletransit/pybcp47/bcp47languageparser.py#L138-L139 |
|
||||
| convert int to string | python | PrimaryEqualProof.to_str_dict | https://github.com/hyperledger-archives/indy-anoncreds/blob/9d9cda3d505c312257d99a13d74d8f05dac3091a/anoncreds/protocol/types.py#L604-L613 |
|
||||
| convert int to string | python | to_int | https://github.com/mfussenegger/cr8/blob/a37d6049f1f9fee2d0556efae2b7b7f8761bffe8/cr8/cli.py#L8-L23 |
|
||||
| how to read .csv file in an efficient way? | ruby | Icosmith.Font.generate_scss | https://github.com/tulios/icosmith-rails/blob/e73c11eaa593fcb6f9ba93d34fbdbfe131693af4/lib/icosmith-rails/font.rb#L80-L88 |
|
||||
| how to read .csv file in an efficient way? | ruby | WebSocket.Extensions.valid_frame_rsv | https://github.com/faye/websocket-extensions-ruby/blob/1a441fac807e08597ec4b315d4022aea716f3efc/lib/websocket/extensions.rb#L120-L134 |
|
||||
| how to read .csv file in an efficient way? | ruby | APNS.Pem.read_file_at_path | https://github.com/jrbeck/mercurius/blob/1580a4af841a6f30ac62f87739fdff87e9608682/lib/mercurius/apns/pem.rb#L12-L18 |
|
||||
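If you are producing the file without [src/predict.py](src/predict.py), a minimal sketch with pandas could look like the following; the rows, identifiers, and URLs below are illustrative, and the rows must already be ordered best-first within each query/language pair:

```python
import pandas as pd

# Illustrative only: `ranked_results` stands in for your model's output,
# already sorted by predicted relevance within each (query, language) pair.
ranked_results = [
    {"query": "convert int to string", "language": "python",
     "identifier": "int_to_decimal_str",
     "url": "https://github.com/raphaelm/python-sepaxml/blob/187b699b1673c862002b2bae7e1bd62fe8623aec/sepaxml/utils.py#L64-L76"},
    # ... one dict per returned result
]

df = pd.DataFrame(ranked_results, columns=["query", "language", "identifier", "url"])
df.to_csv("model_predictions.csv", index=False)
```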
|
||||
|
||||
|
||||
### Submitting model predictions to W&B
|
||||
|
||||
You can submit your results to the benchmark as follows:
|
||||
|
||||
1. Run a training job with any script (your own or the baseline example provided, with or without W&B logging).
|
||||
2. Generate your own file of model predictions following the format above and name it `model_predictions.csv`.
|
||||
3. Upload a run to wandb with this `model_predictions.csv` file attached.
|
||||
|
||||
Our example script [src/predict.py](src/predict.py) takes care of steps 2 and 3 for a model whose training run has been logged to W&B, given the corresponding W&B run id, which you can find on the /overview page in the browser or by clicking the 'info' icon on a given run.
|
||||
|
||||
Here is a short example script that will create a run in W&B and perform the upload (step 3) for a local file of predictions:
|
||||
```python
|
||||
import wandb
|
||||
wandb.init(project="codesearchnet", resume="must")
|
||||
wandb.save('model_predictions.csv')
|
||||
```
|
||||
|
||||
### Publishing your submission
|
||||
|
||||
You've now generated all the content required to submit a run to the CodeSearchNet benchmark. Using the W&B GitHub integration you can now submit your model for review via the web app.
|
||||
|
||||
You can submit your runs by visiting the run page and clicking on the overview tab:
|
||||
![](https://github.com/wandb/core/blob/master/frontends/app/src/assets/run-page-benchmark.png?raw=true)
|
||||
|
||||
or by selecting a run from the runs table:
|
||||
![](https://app.wandb.ai/static/media/submit_benchmark_run.e286da0d.png)
|
||||
|
||||
### Result evaluation
|
||||
|
||||
Once you upload your `model_predictions.csv` file, W&B will compute the normalized discounted cumulative gain (NDCG) of your model's predictions against the human-annotated relevance scores. Further details on the evaluation process and metrics are in the root README. For transparency, we include the script used to evaluate submissions: [src/relevanceeval.py](src/relevanceeval.py).
|
||||
|
||||
|
||||
### Training the baseline model (optional)
|
||||
|
||||
Replicating our results for the CodeSearchNet baseline is optional, as we encourage the community to create their own models and methods for ranking search results. To replicate our baseline submission, you can start with the instructions in the [CodeSearchNet GitHub repository](https://github.com/ml-msr-github/CodeSearchNet). This baseline model uses [src/predict.py](src/predict.py) to generate the submission file.
|
||||
|
||||
Your run will be logged to W&B, within a project that will be automatically linked to this benchmark.
|
|
@@ -0,0 +1,76 @@
|
|||
# Contributor Covenant Code of Conduct
|
||||
|
||||
## Our Pledge
|
||||
|
||||
In the interest of fostering an open and welcoming environment, we as
|
||||
contributors and maintainers pledge to making participation in our project and
|
||||
our community a harassment-free experience for everyone, regardless of age, body
|
||||
size, disability, ethnicity, sex characteristics, gender identity and expression,
|
||||
level of experience, education, socio-economic status, nationality, personal
|
||||
appearance, race, religion, or sexual identity and orientation.
|
||||
|
||||
## Our Standards
|
||||
|
||||
Examples of behavior that contributes to creating a positive environment
|
||||
include:
|
||||
|
||||
* Using welcoming and inclusive language
|
||||
* Being respectful of differing viewpoints and experiences
|
||||
* Gracefully accepting constructive criticism
|
||||
* Focusing on what is best for the community
|
||||
* Showing empathy towards other community members
|
||||
|
||||
Examples of unacceptable behavior by participants include:
|
||||
|
||||
* The use of sexualized language or imagery and unwelcome sexual attention or
|
||||
advances
|
||||
* Trolling, insulting/derogatory comments, and personal or political attacks
|
||||
* Public or private harassment
|
||||
* Publishing others' private information, such as a physical or electronic
|
||||
address, without explicit permission
|
||||
* Other conduct which could reasonably be considered inappropriate in a
|
||||
professional setting
|
||||
|
||||
## Our Responsibilities
|
||||
|
||||
Project maintainers are responsible for clarifying the standards of acceptable
|
||||
behavior and are expected to take appropriate and fair corrective action in
|
||||
response to any instances of unacceptable behavior.
|
||||
|
||||
Project maintainers have the right and responsibility to remove, edit, or
|
||||
reject comments, commits, code, wiki edits, issues, and other contributions
|
||||
that are not aligned to this Code of Conduct, or to ban temporarily or
|
||||
permanently any contributor for other behaviors that they deem inappropriate,
|
||||
threatening, offensive, or harmful.
|
||||
|
||||
## Scope
|
||||
|
||||
This Code of Conduct applies both within project spaces and in public spaces
|
||||
when an individual is representing the project or its community. Examples of
|
||||
representing a project or community include using an official project e-mail
|
||||
address, posting via an official social media account, or acting as an appointed
|
||||
representative at an online or offline event. Representation of a project may be
|
||||
further defined and clarified by project maintainers.
|
||||
|
||||
## Enforcement
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||
reported by contacting the project team at opensource@github.com. All
|
||||
complaints will be reviewed and investigated and will result in a response that
|
||||
is deemed necessary and appropriate to the circumstances. The project team is
|
||||
obligated to maintain confidentiality with regard to the reporter of an incident.
|
||||
Further details of specific enforcement policies may be posted separately.
|
||||
|
||||
Project maintainers who do not follow or enforce the Code of Conduct in good
|
||||
faith may face temporary or permanent repercussions as determined by other
|
||||
members of the project's leadership.
|
||||
|
||||
## Attribution
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
||||
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
|
||||
|
||||
[homepage]: https://www.contributor-covenant.org
|
||||
|
||||
For answers to common questions about this code of conduct, see
|
||||
https://www.contributor-covenant.org/faq
|
|
@@ -0,0 +1,47 @@
|
|||
## Contributing
|
||||
|
||||
[fork]: https://help.github.com/articles/fork-a-repo/
|
||||
[pr]: https://help.github.com/articles/creating-a-pull-request/
|
||||
[style]: https://www.python.org/dev/peps/pep-0008/
|
||||
[code-of-conduct]: CODE_OF_CONDUCT.md
|
||||
[azurepipelines]: azure-pipelines.yml
|
||||
[benchmark]: BENCHMARK.md
|
||||
|
||||
Hi there! We're thrilled that you'd like to contribute to this project. Your help is essential for keeping it great.
|
||||
|
||||
Contributions to this project are [released](https://help.github.com/articles/github-terms-of-service/#6-contributions-under-repository-license) to the public under the [project's open source license](LICENSE).
|
||||
|
||||
Please note that this project is released with a [Contributor Code of Conduct][code-of-conduct]. By participating in this project you agree to abide by its terms.
|
||||
|
||||
## Scope
|
||||
|
||||
We anticipate that the community will design custom architectures and use frameworks other than Tensorflow. Furthermore, we anticipate that other datasets beyond the ones provided in this project might be useful. It is not our intention to integrate the best models and datasets into this repository as a superset of all available ideas. Rather, we intend to provide baseline approaches and a central place of reference with links to related repositories from the community. Therefore, we are accepting pull requests for the following items:
|
||||
|
||||
- Bug fixes
|
||||
- Updates to documentation, including links to your project(s) where improvements to the baseline have been made
|
||||
- Minor improvements to the code
|
||||
|
||||
Please open an issue if you are unsure regarding the best course of action.
|
||||
|
||||
## Submitting a pull request
|
||||
|
||||
0. [Fork][fork] and clone the repository
|
||||
0. Configure and install the dependencies: `script/bootstrap`
|
||||
0. Make sure the tests pass on your machine: see [azure-pipelines.yml][azurepipelines] for the tests we are currently running.
|
||||
0. Create a new branch: `git checkout -b my-branch-name`
|
||||
0. Make your change, add tests, and make sure the tests still pass.
|
||||
0. Push to your fork and [submit a pull request][pr]
|
||||
0. Pat yourself on the back and wait for your pull request to be reviewed and merged.
|
||||
|
||||
Here are a few things you can do that will increase the likelihood of your pull request being accepted:
|
||||
|
||||
- Follow the [style guide][style].
|
||||
- Write tests.
|
||||
- Keep your change as focused as possible. If there are multiple changes you would like to make that are not dependent upon each other, consider submitting them as separate pull requests.
|
||||
- Write a [good commit message](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html).
|
||||
|
||||
## Resources
|
||||
|
||||
- [How to Contribute to Open Source](https://opensource.guide/how-to-contribute/)
|
||||
- [Using Pull Requests](https://help.github.com/articles/about-pull-requests/)
|
||||
- [GitHub Help](https://help.github.com)
|
|
@@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2019 GitHub
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@@ -0,0 +1,350 @@
|
|||
![Tests](https://github.com/github/CodeSearchNet/workflows/Smoke%20Test/badge.svg)
|
||||
[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT) [![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/)
|
||||
[![Weights-And-Biases](https://img.shields.io/badge/Weights%20&%20Biases-black.svg?logo=google-analytics)](https://app.wandb.ai/github/codesearchnet/benchmark)
|
||||
|
||||
**TODO: Update Paper Link**
|
||||
|
||||
[paper]: https://ml4code.github.io/
|
||||
|
||||
**Table of Contents**
|
||||
|
||||
<!-- TOC depthFrom:1 depthTo:6 withLinks:1 updateOnSave:1 orderedList:0 -->
|
||||
|
||||
- [Introduction](#introduction)
|
||||
- [Project Overview](#project-overview)
|
||||
- [Data](#data)
|
||||
- [Evaluation](#evaluation)
|
||||
- [Annotations](#annotations)
|
||||
- [Setup](#setup)
|
||||
- [Data Details](#data-details)
|
||||
- [Data Acquisition](#data-acquisition)
|
||||
- [Schema & Format](#schema-format)
|
||||
- [Downloading Data from S3](#downloading-data-from-s3)
|
||||
- [Running our Baseline Model](#running-our-baseline-model)
|
||||
- [Quickstart](#quickstart)
|
||||
- [Model Architecture](#model-architecture)
|
||||
- [Training](#training)
|
||||
- [References](#references)
|
||||
- [Benchmark](#benchmark)
|
||||
- [How to Contribute](#how-to-contribute)
|
||||
- [Other READMEs](#other-readmes)
|
||||
- [W&B Setup](#wb-setup)
|
||||
- [Licenses](#licenses)
|
||||
|
||||
<!-- /TOC -->
|
||||
|
||||
# QuickStart: Training Baseline Models
|
||||
|
||||
Want to jump right into training our baseline model? Head [here](#quickstart).
|
||||
|
||||
# Introduction
|
||||
|
||||
## Project Overview
|
||||
|
||||
[CodeSearchNet][paper] is a collection of datasets and benchmarks that explore the problem of code retrieval using natural language. This research is a continuation of some ideas presented in this [blog post](https://githubengineering.com/towards-natural-language-semantic-code-search/) and is a joint collaboration between GitHub and the [Deep Program Understanding](https://www.microsoft.com/en-us/research/project/program/) group at [Microsoft Research - Cambridge](https://www.microsoft.com/en-us/research/lab/microsoft-research-cambridge/). Our intent is to present and provide a platform for this research to the community by providing the following:
|
||||
|
||||
1. Instructions for obtaining large corpora of relevant data
|
||||
2. Open source code for a range of baseline models, along with pre-trained weights
|
||||
3. Baseline evaluation metrics and utilities.
|
||||
4. Mechanisms to track progress on a [shared community benchmark](https://app.wandb.ai/github/codesearchnet/benchmark), hosted by [Weights & Biases](https://www.wandb.com/)
|
||||
|
||||
We hope that CodeSearchNet is a step towards engaging with the broader machine learning and NLP community regarding the relationship between source code and natural language. We describe a specific task here, but we expect and welcome other uses of our dataset.
|
||||
|
||||
More context regarding the motivation for this problem is in [this paper][paper].
|
||||
|
||||
## Data
|
||||
|
||||
The primary dataset consists of 2 million (`comment`, `code`) pairs from open source libraries. Concretely, a `comment` is a top-level function or method comment (e.g. [docstrings](https://en.wikipedia.org/wiki/Docstring) in Python), and `code` is an entire function or method. Currently, the dataset contains Python, JavaScript, Ruby, Go, Java, and PHP code. Throughout this repo, we refer to the terms docstring and query interchangeably. We partition the data into train, validation, and test splits such that code from the same repository can only exist in one partition. Currently this is the only dataset on which we train our model. Summary statistics about this dataset can be found in [this notebook](notebooks/ExploreData.ipynb).
|
||||
|
||||
For more information about how to obtain the data, see [this section](#data-details).
|
||||
|
||||
## Evaluation
|
||||
|
||||
The metric we use for evaluation is [Normalized Discounted Cumulative Gain](https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG) (NDCG). Please reference [this paper][paper] for further details regarding model evaluation.
|
||||
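For intuition only, here is a toy sketch of NDCG using the common linear-gain, log2-discount form; the authoritative scoring for the benchmark is implemented in [src/relevanceeval.py](src/relevanceeval.py) and may differ in details:

```python
import numpy as np

def ndcg(relevance_scores, k=None):
    """Toy NDCG for one ranked result list, given human relevance scores (higher is better)."""
    rel = np.asarray(relevance_scores, dtype=float)
    k = rel.size if k is None else min(k, rel.size)
    discounts = 1.0 / np.log2(np.arange(2, k + 2))            # positions 1..k
    dcg = float(np.sum(rel[:k] * discounts))                  # gain of the predicted ranking
    idcg = float(np.sum(np.sort(rel)[::-1][:k] * discounts))  # gain of the ideal ranking
    return dcg / idcg if idcg > 0 else 0.0

print(ndcg([3, 2, 0, 1]))  # ~0.99: only the last two results are swapped relative to the ideal order
```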
|
||||
### Annotations
|
||||
|
||||
We manually annotated retrieval results for the six languages from 99 general [queries](resources/queries.csv). This dataset is used as ground truth data for evaluation _only_. Please reference [this paper][paper] for further details on the annotation process.
|
||||
|
||||
|
||||
## Setup
|
||||
|
||||
You should only have to perform the setup steps once to download the data and prepare the environment.
|
||||
|
||||
1. Due to the complexity of installing all dependencies, we prepared Docker containers to run this code. You can find instructions on how to install Docker in the [official docs](https://docs.docker.com/get-started/). Additionally, you must install [Nvidia-Docker](https://github.com/NVIDIA/nvidia-docker) to satisfy GPU-compute related dependencies. For those who are new to Docker, this [blog post](https://towardsdatascience.com/how-docker-can-help-you-become-a-more-effective-data-scientist-7fc048ef91d5) provides a gentle introduction focused on data science.
|
||||
|
||||
2. After installing Docker, you need to download the pre-processed datasets, which are hosted on S3. You can do this by running `script/setup`.
|
||||
```
|
||||
script/setup
|
||||
```
|
||||
This will build Docker containers and download the datasets. By default, the data is downloaded into the `resources/data/` folder inside this repository, with the directory structure described [here](resources/README.md).
|
||||
|
||||
**The datasets you will download (most of them compressed) have a combined size of only ~ 3.5 GB.**
|
||||
|
||||
For more about the data, see [Data Details](#data-details) below as well as [this notebook](notebooks/ExploreData.ipynb).
|
||||
|
||||
|
||||
# Data Details
|
||||
|
||||
## Data Acquisition
|
||||
|
||||
If you have run the [setup steps](#setup) above you will already have the data, and nothing more needs to be done. The data will be available in the `/resources/data` folder of this repository, with [this directory structure](/resources/README.md).
|
||||
|
||||
## Schema & Format
|
||||
|
||||
Data is stored in [jsonlines](http://jsonlines.org/) format. Each line in the uncompressed file represents one example (usually a function with an associated comment). A prettified example of one row is illustrated below.
|
||||
|
||||
- **repo:** the owner/repo
|
||||
- **path:** the full path to the original file
|
||||
- **func_name:** the function or method name
|
||||
- **original_string:** the raw string before tokenization or parsing
|
||||
- **language:** the programming language
|
||||
- **code:** the part of the `original_string` that is code
|
||||
- **code_tokens:** tokenized version of `code`
|
||||
- **docstring:** the top-level comment or docstring, if it exists in the original string
|
||||
- **docstring_tokens:** tokenized version of `docstring`
|
||||
- **sha:** this field is not being used [TODO: add note on where this comes from?]
|
||||
- **partition:** a flag indicating which partition ({train, valid, test, etc.}) this datum belongs to. This is not used by the model; instead, we rely on the directory structure to denote the partition of the data.
|
||||
- **url:** the URL for this code snippet, including the line numbers
|
||||
|
||||
Code, comments, and docstrings are extracted in a language-specific manner, removing artifacts of that language.
|
||||
|
||||
```{json}
|
||||
{
|
||||
'code': 'def get_vid_from_url(url):\n'
|
||||
' """Extracts video ID from URL.\n'
|
||||
' """\n'
|
||||
" return match1(url, r'youtu\\.be/([^?/]+)') or \\\n"
|
||||
" match1(url, r'youtube\\.com/embed/([^/?]+)') or \\\n"
|
||||
" match1(url, r'youtube\\.com/v/([^/?]+)') or \\\n"
|
||||
" match1(url, r'youtube\\.com/watch/([^/?]+)') or \\\n"
|
||||
" parse_query_param(url, 'v') or \\\n"
|
||||
" parse_query_param(parse_query_param(url, 'u'), 'v')",
|
||||
'code_tokens': ['def',
|
||||
'get_vid_from_url',
|
||||
'(',
|
||||
'url',
|
||||
')',
|
||||
':',
|
||||
'return',
|
||||
'match1',
|
||||
'(',
|
||||
'url',
|
||||
',',
|
||||
"r'youtu\\.be/([^?/]+)'",
|
||||
')',
|
||||
'or',
|
||||
'match1',
|
||||
'(',
|
||||
'url',
|
||||
',',
|
||||
"r'youtube\\.com/embed/([^/?]+)'",
|
||||
')',
|
||||
'or',
|
||||
'match1',
|
||||
'(',
|
||||
'url',
|
||||
',',
|
||||
"r'youtube\\.com/v/([^/?]+)'",
|
||||
')',
|
||||
'or',
|
||||
'match1',
|
||||
'(',
|
||||
'url',
|
||||
',',
|
||||
"r'youtube\\.com/watch/([^/?]+)'",
|
||||
')',
|
||||
'or',
|
||||
'parse_query_param',
|
||||
'(',
|
||||
'url',
|
||||
',',
|
||||
"'v'",
|
||||
')',
|
||||
'or',
|
||||
'parse_query_param',
|
||||
'(',
|
||||
'parse_query_param',
|
||||
'(',
|
||||
'url',
|
||||
',',
|
||||
"'u'",
|
||||
')',
|
||||
',',
|
||||
"'v'",
|
||||
')'],
|
||||
'docstring': 'Extracts video ID from URL.',
|
||||
'docstring_tokens': ['Extracts', 'video', 'ID', 'from', 'URL', '.'],
|
||||
'func_name': 'YouTube.get_vid_from_url',
|
||||
'language': 'python',
|
||||
'original_string': 'def get_vid_from_url(url):\n'
|
||||
' """Extracts video ID from URL.\n'
|
||||
' """\n'
|
||||
" return match1(url, r'youtu\\.be/([^?/]+)') or \\\n"
|
||||
" match1(url, r'youtube\\.com/embed/([^/?]+)') or "
|
||||
'\\\n'
|
||||
" match1(url, r'youtube\\.com/v/([^/?]+)') or \\\n"
|
||||
" match1(url, r'youtube\\.com/watch/([^/?]+)') or "
|
||||
'\\\n'
|
||||
" parse_query_param(url, 'v') or \\\n"
|
||||
" parse_query_param(parse_query_param(url, 'u'), "
|
||||
"'v')",
|
||||
'partition': 'test',
|
||||
'path': 'src/you_get/extractors/youtube.py',
|
||||
'repo': 'soimort/you-get',
|
||||
'sha': 'b746ac01c9f39de94cac2d56f665285b0523b974',
|
||||
'url': 'https://github.com/soimort/you-get/blob/b746ac01c9f39de94cac2d56f665285b0523b974/src/you_get/extractors/youtube.py#L135-L143'
|
||||
}
|
||||
```
|
||||
|
||||
Furthermore, summary statistics such as row counts and token length histograms can be found in [this notebook](notebooks/ExploreData.ipynb); a short snippet for loading a shard yourself is shown below.
|
||||
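For example, a quick way to load one training shard into pandas; this assumes the default layout produced by `script/setup`, so adjust the glob if your data lives elsewhere or uses a different naming scheme:

```python
import glob
import gzip
import json

import pandas as pd

# Default layout created by `script/setup`; change the language/partition as needed.
shard_paths = sorted(glob.glob('../resources/data/python/final/jsonl/train/*.jsonl.gz'))

rows = []
with gzip.open(shard_paths[0], mode='rt', encoding='utf-8') as f:  # one shard is enough for a quick look
    rows.extend(json.loads(line) for line in f)

df = pd.DataFrame(rows)
print(df[['repo', 'func_name', 'language', 'partition']].head())
print('median code tokens per function:', df['code_tokens'].str.len().median())
```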
|
||||
## Downloading Data from S3
|
||||
|
||||
The shell script `/script/setup` will automatically download these files into the `/resources/data` directory. Here are the links to the relevant files for visibility:
|
||||
|
||||
The s3 links follow this pattern:
|
||||
|
||||
> https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{python,java,go,php,javascript,ruby}.zip
|
||||
|
||||
For example, the link for the `java` dataset is:
|
||||
|
||||
> https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip
|
||||
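`script/setup` downloads and unpacks these archives for you; if you want to fetch a single language manually, here is a small sketch (the extraction target is an assumption -- see [resources/README.md](resources/README.md) for the expected layout):

```python
import urllib.request
import zipfile

url = 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip'
urllib.request.urlretrieve(url, 'java.zip')

with zipfile.ZipFile('java.zip') as archive:
    archive.extractall('resources/data/')   # assumed target directory
```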
|
||||
The size of the dataset is approximately 20 GB. The various files and the directory structure are explained [here](resources/README.md).
|
||||
|
||||
|
||||
# Running our Baseline Model
|
||||
|
||||
Warning: the scripts provided to reproduce our baseline model take more than 24 hours on an [AWS P3-V100](https://aws.amazon.com/ec2/instance-types/p3/) instance.
|
||||
|
||||
## Quickstart
|
||||
|
||||
Make sure you have [Docker](https://docs.docker.com/get-started/) and [Nvidia-Docker](https://github.com/NVIDIA/nvidia-docker) (for GPU-compute related dependencies) installed. You should only have to perform the setup steps once to prepare the environment and download the data.
|
||||
|
||||
```bash
|
||||
# clone this repository
|
||||
git clone https://github.com/ml-msr-github/CodeSearchNet.git
|
||||
# download data (~3.5GB) from S3; build and run Docker container
|
||||
# (this will land you inside the Docker container, starting in the /src directory--you can detach from/attach to this container to pause/continue your work)
|
||||
cd CodeSearchNet/
|
||||
script/setup
|
||||
# optional: log in to W&B to see your training metrics, track your experiments, and submit your models to the community benchmark
|
||||
wandb login
|
||||
# verify your setup by training a tiny model
|
||||
python train.py --testrun
|
||||
# see other command line options, try a full training run, and explore other model variants by extending this baseline training script example
|
||||
python train.py --help
|
||||
python train.py
|
||||
```
|
||||
|
||||
Once you're satisfied with a new model, test it against the CodeSearchNet Challenge. This will generate a CSV file of model prediction scores which you can then submit to the Weights & Biases [community benchmark](https://app.wandb.ai/github/codesearchnet/benchmark) by [following these instructions](src/docs/BENCHMARK.md).
|
||||
|
||||
```bash
|
||||
python predict.py [-r | --wandb_run_id] github/codesearchnet/0123456
|
||||
# or
|
||||
python predict.py [-m | --model_file] ../resources/saved_models/*.pkl.gz
|
||||
```
|
||||
|
||||
## Model Architecture
|
||||
|
||||
Our baseline models ingest a parallel corpus of (`comments`, `code`) and learn to retrieve a code snippet given a natural language query. Specifically, `comments` are top-level function and method comments (e.g. docstrings in Python), and `code` is an entire function or method. Throughout this repo, we refer to the terms docstring and query interchangeably.
|
||||
|
||||
The query has a single encoder, whereas each programming language has its own encoder. The available encoders are Neural-Bag-Of-Words, RNN, 1D-CNN, Self-Attention (BERT), and a 1D-CNN+Self-Attention Hybrid.
|
||||
|
||||
The diagram below illustrates the general architecture of our baseline models:
|
||||
|
||||
![alt text](images/architecture.png "Architecture")
|
||||
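The baseline implementation lives in `src/` and is written in TensorFlow; purely to make the retrieval step concrete, here is a framework-free sketch that assumes the encoders have already mapped a query and a set of candidate code snippets into a shared embedding space:

```python
import numpy as np

def rank_candidates(query_embedding: np.ndarray, code_embeddings: np.ndarray, top_k: int = 5) -> np.ndarray:
    """Return the indices of the `top_k` code snippets closest to the query (cosine similarity)."""
    q = query_embedding / np.linalg.norm(query_embedding)
    c = code_embeddings / np.linalg.norm(code_embeddings, axis=1, keepdims=True)
    return np.argsort(-(c @ q))[:top_k]

# Toy example: 1000 candidate functions embedded in a 128-dimensional joint space.
rng = np.random.default_rng(0)
print(rank_candidates(rng.normal(size=128), rng.normal(size=(1000, 128))))
```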
|
||||
## Training
|
||||
|
||||
This step assumes that you have a suitable Nvidia-GPU with [Cuda v9.0](https://developer.nvidia.com/cuda-90-download-archive) installed. We used [AWS P3-V100](https://aws.amazon.com/ec2/instance-types/p3/) instances (a `p3.2xlarge` is sufficient).
|
||||
|
||||
1. Start the model run environment by running `script/console`:
|
||||
```
|
||||
script/console
|
||||
```
|
||||
This will drop you into the shell of a Docker container with all necessary dependencies installed, including the code in this repository, along with data that you downloaded in the previous step. By default you will be placed in the `src/` folder of this GitHub repository. From here you can execute commands to run the model.
|
||||
|
||||
2. Set up [W&B](https://docs.wandb.com/docs/started.html) (free for open source projects) per the instructions below if you would like to share your results on the community benchmark. This is optional but highly recommended.
|
||||
|
||||
3. The entry point to this model is `src/train.py`. You can see various options by executing the following command:
|
||||
```
|
||||
python train.py --help
|
||||
```
|
||||
To test if everything is working on a small dataset, you can run the following command:
|
||||
```
|
||||
python train.py --testrun
|
||||
```
|
||||
|
||||
4. Now you are prepared for a full training run. Example commands to kick off training runs:
|
||||
* Training a neural-bag-of-words model on all languages
|
||||
```
|
||||
python train.py --model neuralbow
|
||||
```
|
||||
|
||||
The above command will assume default values for the location(s) of the training data and for the destination where you would like to save the output model. The default location for the training data is specified in `/src/data_dirs_{train,valid,test}.txt`. These files each contain a list of paths where data for the corresponding partition exists. If more than one path is specified (separated by a newline), the data from all of the paths will be concatenated together. For example, this is the content of `src/data_dirs_train.txt`:
|
||||
|
||||
```
|
||||
$ cat data_dirs_train.txt
|
||||
../resources/data/python/final/jsonl/train
|
||||
../resources/data/javascript/final/jsonl/train
|
||||
../resources/data/java/final/jsonl/train
|
||||
../resources/data/php/final/jsonl/train
|
||||
../resources/data/ruby/final/jsonl/train
|
||||
../resources/data/go/final/jsonl/train
|
||||
```
|
||||
|
||||
By default models are saved in the `resources/saved_models` folder of this repository.
|
||||
|
||||
* Training a 1D-CNN model on Python data only:
|
||||
```
|
||||
python train.py --model 1dcnn /trained_models ../resources/data/python/final/jsonl/train ../resources/data/python/final/jsonl/valid ../resources/data/python/final/jsonl/test
|
||||
```
|
||||
|
||||
The above command overrides the default location for saving the model (to `/trained_models`) and also overrides the sources of the train, validation, and test sets.
|
||||
|
||||
Additional notes:
|
||||
* Options for `--model` are currently listed in `src/model_restore_helper.get_model_class_from_name`.
|
||||
|
||||
* Hyperparameters are specific to the respective model/encoder classes; a simple trick to discover them is to kick off a run without specifying any hyperparameter choices, as that will print a list of all the hyperparameters used, with their default values, in JSON format. Individual values can then be overridden with `--hypers-override`, as in the example after this list.
|
||||
|
||||
* By default, models are saved in the `/resources/saved_models` folder of this repository, but this can be overridden as shown above.
|
||||
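For example, overriding a single hyperparameter while keeping the rest at their defaults (the key name here is illustrative; use the printed defaults as the authoritative list for your model):
```
python train.py --model neuralbow --hypers-override '{"batch_size": 1000}'
```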
|
||||
|
||||
# References
|
||||
|
||||
## Benchmark
|
||||
|
||||
We are using a community benchmark for this project to encourage collaboration and improve reproducibility. It is hosted by [Weights & Biases](https://www.wandb.com/) (W&B), which is free for open source projects. Our entries in the benchmark link to detailed logs of our training and evaluation metrics, as well as model artifacts, and we encourage other participants to provide as much transparency as possible.
|
||||
|
||||
We invite the community to submit their runs to this benchmark to facilitate transparency by following [these instructions](src/docs/BENCHMARK.md).
|
||||
|
||||
## How to Contribute
|
||||
|
||||
We anticipate that the community will design custom architectures and use frameworks other than Tensorflow. Furthermore, we anticipate that additional datasets will be useful. It is not our intention to integrate these models, approaches, and datasets into this repository as a superset of all available ideas. Rather, we intend to maintain the baseline models and links to the data in this repository as a central place of reference. We are accepting PRs that update the documentation, link to your project(s) with improved benchmarks, fix bugs, or make minor improvements to the code. Here are [more specific guidelines for contributing to this repository](CONTRIBUTING.md); note particularly our [Code of Conduct](CODE_OF_CONDUCT.md). Please open an issue if you are unsure of the best course of action.
|
||||
|
||||
## Other READMEs
|
||||
|
||||
- [Submitting to the benchmark](BENCHMARK.md)
|
||||
- [Data structure](/resources/README.md)
|
||||
|
||||
## W&B Setup
|
||||
|
||||
To initialize W&B:
|
||||
|
||||
1. Navigate to the `/src` directory in this repository.
|
||||
|
||||
2. If it's your first time using W&B on a machine, you will need to login:
|
||||
|
||||
```
|
||||
$ wandb login
|
||||
```
|
||||
|
||||
3. You will be asked for your API key, which appears on your [W&B profile settings page](https://app.wandb.ai/settings).
|
||||
|
||||
## Licenses
|
||||
|
||||
The licenses for source code used as data for this project are provided with the [data download](#downloading-data-from-s3) for each language in `_licenses.pkl` [files](resources/README.md#directory-structure).
|
||||
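Each licenses file (as built by the `licenses.py` script included in this commit) maps a repository name (`owner/repo`) to a list of `(path, file_content)` pairs. A minimal way to inspect one, where the path below is an assumption and depends on where the data download places the file:

```python
import pickle

with open('../resources/data/python_licenses.pkl', 'rb') as f:  # assumed path
    licenses = pickle.load(f)

nwo, files = next(iter(licenses.items()))
path, content = files[0]
print(nwo, path, content[:200])
```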
|
||||
The code and documentation for this project are released under the [MIT License](LICENSE).
|
|
@@ -0,0 +1,26 @@
|
|||
FROM python:3.6
|
||||
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
|
||||
# Install Python packages
|
||||
RUN pip --no-cache-dir install --upgrade \
|
||||
docopt \
|
||||
dpu-utils \
|
||||
ipdb \
|
||||
wandb \
|
||||
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.12.0-cp36-cp36m-linux_x86_64.whl \
|
||||
typed_ast \
|
||||
more_itertools \
|
||||
scipy \
|
||||
toolz \
|
||||
tqdm \
|
||||
pandas \
|
||||
parso \
|
||||
pytest \
|
||||
mypy
|
||||
|
||||
RUN pip --no-cache-dir install --upgrade \
|
||||
ipdb
|
||||
|
||||
COPY src/docs/THIRD_PARTY_NOTICE.md .
|
||||
COPY . /
|
||||
WORKDIR /src
|
||||
CMD bash
|
|
@@ -0,0 +1,75 @@
|
|||
FROM tensorflow/tensorflow:1.12.0-gpu-py3
|
||||
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
|
||||
|
||||
RUN add-apt-repository -y ppa:git-core/ppa
|
||||
RUN add-apt-repository -y ppa:jonathonf/python-3.6
|
||||
|
||||
RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \
|
||||
byobu \
|
||||
ca-certificates \
|
||||
git-core git \
|
||||
htop \
|
||||
libglib2.0-0 \
|
||||
libjpeg-dev \
|
||||
libpng-dev \
|
||||
libxext6 \
|
||||
libsm6 \
|
||||
libxrender1 \
|
||||
libcupti-dev \
|
||||
openssh-server \
|
||||
python3.6 \
|
||||
python3.6-dev \
|
||||
software-properties-common \
|
||||
vim \
|
||||
unzip \
|
||||
&& \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN apt-get -y update
|
||||
|
||||
# Set up Python 3.6 (needed for other dependencies)
|
||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.5 1
|
||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 2
|
||||
RUN apt-get install -y python3-setuptools
|
||||
RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
|
||||
RUN python3 get-pip.py
|
||||
RUN pip install --upgrade pip
|
||||
|
||||
# Pin TF Version on v1.12.0
|
||||
RUN pip --no-cache-dir install https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
|
||||
|
||||
# Other python packages
|
||||
RUN pip --no-cache-dir install --upgrade \
|
||||
altair==3.2.0 \
|
||||
annoy==1.16.0 \
|
||||
docopt==0.6.2 \
|
||||
dpu_utils==0.1.34 \
|
||||
ipdb==0.12.2 \
|
||||
jsonpath_rw_ext==1.2.2 \
|
||||
jupyter==1.0.0 \
|
||||
more_itertools==7.2.0 \
|
||||
numpy==1.17.0 \
|
||||
pandas==0.25.0 \
|
||||
parso==0.5.1 \
|
||||
pygments==2.4.2 \
|
||||
requests==2.22.0 \
|
||||
scipy==1.3.1 \
|
||||
SetSimilaritySearch==0.1.7 \
|
||||
toolz==0.10.0 \
|
||||
tqdm==4.34.0 \
|
||||
typed_ast==1.4.0 \
|
||||
wandb==0.8.10 \
|
||||
wget==3.2
|
||||
|
||||
ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
|
||||
|
||||
# Open Ports for TensorBoard, Jupyter, and SSH
|
||||
EXPOSE 6006
|
||||
EXPOSE 7654
|
||||
EXPOSE 22
|
||||
|
||||
WORKDIR /home/dev/src
|
||||
COPY src/docs/THIRD_PARTY_NOTICE.md .
|
||||
|
||||
CMD bash
|
|
@@ -0,0 +1,11 @@
|
|||
FROM python:3.7.3
|
||||
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
|
||||
|
||||
RUN pip --no-cache-dir install --upgrade \
|
||||
pip \
|
||||
docopt \
|
||||
pandas
|
||||
|
||||
COPY src/docs/THIRD_PARTY_NOTICE.md .
|
||||
|
||||
CMD ["/home/dev/script/download_and_preprocess"]
|
|
@@ -0,0 +1,34 @@
|
|||
FROM python:3.7.3
|
||||
|
||||
RUN touch /etc/inside-container
|
||||
|
||||
RUN set -ex && pip3 install --upgrade pip
|
||||
RUN set -ex && pip3 --no-cache-dir install --upgrade jupyter \
|
||||
tree_sitter==0.0.5 \
|
||||
requests \
|
||||
pyhive \
|
||||
tqdm \
|
||||
pandas \
|
||||
python-arango \
|
||||
docopt \
|
||||
elasticsearch \
|
||||
dpu_utils
|
||||
|
||||
RUN mkdir -p /src/vendor
|
||||
RUN mkdir -p /src/build
|
||||
RUN cd /src/vendor && git clone https://github.com/tree-sitter/tree-sitter-python.git
|
||||
RUN cd /src/vendor && git clone https://github.com/tree-sitter/tree-sitter-javascript.git
|
||||
RUN cd /src/vendor && git clone https://github.com/tree-sitter/tree-sitter-typescript.git
|
||||
RUN cd /src/vendor && git clone https://github.com/tree-sitter/tree-sitter-go.git
|
||||
RUN cd /src/vendor && git clone https://github.com/tree-sitter/tree-sitter-ruby.git
|
||||
RUN cd /src/vendor && git clone https://github.com/tree-sitter/tree-sitter-java.git
|
||||
RUN cd /src/vendor && git clone https://github.com/tree-sitter/tree-sitter-cpp.git
|
||||
RUN cd /src/vendor && git clone https://github.com/tree-sitter/tree-sitter-c-sharp.git
|
||||
RUN cd /src/vendor && git clone https://github.com/tree-sitter/tree-sitter-php.git
|
||||
|
||||
COPY script/setup.py /src/function-parser/script/setup.py
|
||||
|
||||
RUN cd /src/function-parser/script && python setup.py
|
||||
WORKDIR /src/function-parser
|
||||
|
||||
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--no-browser", "--allow-root", "--NotebookApp.token=''"]
|
|
@@ -0,0 +1,58 @@
|
|||
# Function Parser
|
||||
|
||||
This repository contains various utils to parse GitHub repositories into function definition and docstring pairs. It is based on [tree-sitter](https://github.com/tree-sitter/) to parse code into [ASTs](https://en.wikipedia.org/wiki/Abstract_syntax_tree) and applies heuristics to parse metadata in more detail. Currently, it supports 6 languages: Python, Java, Go, PHP, Ruby, and JavaScript.
|
||||
|
||||
It also parses function calls and links them with their definitions for Python.
|
||||
|
||||
## Examples
|
||||
|
||||
The input library `keras-team/keras` is parsed into a list of functions with various metadata (e.g. identifier, docstring, sha, url). Below is an example output for the `Activation` function from the `keras` library.
|
||||
```
|
||||
{
|
||||
'nwo': 'keras-team/keras',
|
||||
'sha': '0fc33feb5f4efe3bb823c57a8390f52932a966ab',
|
||||
'path': 'keras/layers/core.py',
|
||||
'language': 'python',
|
||||
'identifier': 'Activation.__init__',
|
||||
'parameters': '(self, activation, **kwargs)',
|
||||
'argument_list': '',
|
||||
'return_statement': '',
|
||||
'docstring': '',
|
||||
'function': 'def __init__(self, activation, **kwargs):\n super(Activation, self).__init__(**kwargs)\n self.supports_masking = True\n self.activation = activations.get(activation)',
|
||||
'url': 'https://github.com/keras-team/keras/blob/0fc33feb5f4efe3bb823c57a8390f52932a966ab/keras/layers/core.py#L294-L297'
|
||||
}
|
||||
```
|
||||
|
||||
One example of `Activation` in the call sites of `eriklindernoren/Keras-GAN` repository is shown below:
|
||||
```
|
||||
{
|
||||
'nwo': 'eriklindernoren/Keras-GAN',
|
||||
'sha': '44d3320e84ca00071de8a5c0fb4566d10486bb1d',
|
||||
'path': 'dcgan/dcgan.py',
|
||||
'language': 'python',
|
||||
'identifier': 'Activation',
|
||||
'argument_list': '("relu")',
|
||||
'url': 'https://github.com/eriklindernoren/Keras-GAN/blob/44d3320e84ca00071de8a5c0fb4566d10486bb1d/dcgan/dcgan.py#L61-L61'
|
||||
}
|
||||
```
|
||||
|
||||
With an edge linking the two URLs:
|
||||
```
|
||||
(
|
||||
'https://github.com/eriklindernoren/Keras-GAN/blob/44d3320e84ca00071de8a5c0fb4566d10486bb1d/dcgan/dcgan.py#L61-L61',
|
||||
'https://github.com/keras-team/keras/blob/0fc33feb5f4efe3bb823c57a8390f52932a966ab/keras/layers/core.py#L294-L297'
|
||||
)
|
||||
```
|
||||
|
||||
A [demo notebook](function_parser/demo.ipynb) is also provided for exploration.
|
||||
|
||||
## Usage
|
||||
### To run the notebook on your own:
|
||||
1. `script/bootstrap` to build docker container
|
||||
2. `script/server` to run the jupyter notebook server and navigate to `function_parser/demo.ipynb`
|
||||
|
||||
### To run the script:
|
||||
1. `script/bootstrap` to build docker container
|
||||
2. `script/setup` to download libraries.io data
|
||||
3. `script/console` to ssh into the container
|
||||
4. Inside the container, run `python function_parser/process.py --language python --processes 16 '/src/function-parser/data/libraries-1.4.0-2018-12-22/' '/src/function-parser/data/'`
|
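To parse a single file rather than a whole package ecosystem, the `parser_cli.py` helper included alongside `process.py` prints the extracted definitions as JSON; for example, from inside the container (the script path is an assumption):
```
python function_parser/parser_cli.py --language python function_parser/process.py
```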
The diff for one file is not shown because of its large size.
|
@@ -0,0 +1,89 @@
|
|||
import glob
|
||||
from itertools import chain
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
|
||||
from dask.distributed import Client
|
||||
import dask.distributed
|
||||
from tqdm import tqdm
|
||||
|
||||
from language_data import LANGUAGE_METADATA
|
||||
from utils import download
|
||||
|
||||
# Gets notices
|
||||
LEGAL_FILES_REGEX = r'(AUTHORS|NOTICE|LEGAL)(?:\..*)?\Z'
|
||||
|
||||
PREFERRED_EXT_REGEX = r'\.(md|markdown|txt|html)\Z'  # alternation of extensions, not a character class
|
||||
|
||||
# Regex to match any extension except .spdx or .header
|
||||
OTHER_EXT_REGEX = r'\.(?!spdx|header|gemspec)[^./]+\Z'
|
||||
|
||||
# Regex to match, LICENSE, LICENCE, unlicense, etc.
|
||||
LICENSE_REGEX = '(un)?licen[sc]e'
|
||||
|
||||
# Regex to match COPYING, COPYRIGHT, etc.
|
||||
COPYING_REGEX = 'copy(ing|right)'
|
||||
|
||||
# Regex to match OFL.
|
||||
OFL_REGEX = 'ofl'
|
||||
|
||||
# BSD + PATENTS patent file
|
||||
PATENTS_REGEX = 'patents'
|
||||
|
||||
|
||||
def match_license_file(filename):
|
||||
for regex in [LEGAL_FILES_REGEX,
|
||||
LICENSE_REGEX + '\Z',
|
||||
LICENSE_REGEX + PREFERRED_EXT_REGEX,
|
||||
COPYING_REGEX + '\Z',
|
||||
COPYING_REGEX + PREFERRED_EXT_REGEX,
|
||||
LICENSE_REGEX + OTHER_EXT_REGEX,
|
||||
COPYING_REGEX + OTHER_EXT_REGEX,
|
||||
LICENSE_REGEX + '[-_]',
|
||||
COPYING_REGEX + '[-_]',
|
||||
'[-_]' + LICENSE_REGEX,
|
||||
'[-_]' + COPYING_REGEX,
|
||||
OFL_REGEX + PREFERRED_EXT_REGEX,
|
||||
OFL_REGEX + OTHER_EXT_REGEX,
|
||||
OFL_REGEX + '\Z',
|
||||
PATENTS_REGEX + '\Z',
|
||||
PATENTS_REGEX + OTHER_EXT_REGEX]:
|
||||
if re.match(regex, filename.lower()):
|
||||
return filename
|
||||
return None
|
||||
|
||||
def flattenlist(listoflists):
|
||||
return list(chain.from_iterable(listoflists))
|
||||
|
||||
def fetch_license(nwo):
|
||||
licenses = []
|
||||
tmp_dir = download(nwo)
|
||||
for f in sorted(glob.glob(tmp_dir.name + '/**/*', recursive=True), key=lambda x: len(x)):
|
||||
if not os.path.isdir(f):
|
||||
if match_license_file(f.split('/')[-1]):
|
||||
licenses.append((nwo, f.replace(tmp_dir.name + '/', ''), open(f, errors='surrogateescape').read()))
|
||||
|
||||
return licenses
|
||||
|
||||
|
||||
client = Client()
|
||||
|
||||
for language in LANGUAGE_METADATA.keys():
|
||||
definitions = pickle.load(open('../data/{}_dedupe_definitions_v2.pkl'.format(language), 'rb'))
|
||||
nwos = list(set([d['nwo'] for d in definitions]))
|
||||
|
||||
futures = client.map(fetch_license, nwos)
|
||||
results = []
|
||||
for r in tqdm(futures):
|
||||
try:
|
||||
results.append(r.result(2))
|
||||
except dask.distributed.TimeoutError:
|
||||
continue
|
||||
|
||||
flat_results = flattenlist(results)
|
||||
licenses = dict()
|
||||
for nwo, path, content in flat_results:
|
||||
if content:
|
||||
licenses[nwo] = licenses.get(nwo, []) + [(path, content)]
|
||||
pickle.dump(licenses, open('../data/{}_licenses.pkl'.format(language), 'wb'))
|
|
@@ -0,0 +1,40 @@
|
|||
from parsers.go_parser import GoParser
|
||||
from parsers.java_parser import JavaParser
|
||||
from parsers.javascript_parser import JavascriptParser
|
||||
from parsers.php_parser import PhpParser
|
||||
from parsers.python_parser import PythonParser
|
||||
from parsers.ruby_parser import RubyParser
|
||||
|
||||
|
||||
LANGUAGE_METADATA = {
|
||||
'python': {
|
||||
'platform': 'pypi',
|
||||
'ext': 'py',
|
||||
'language_parser': PythonParser
|
||||
},
|
||||
'java': {
|
||||
'platform': 'maven',
|
||||
'ext': 'java',
|
||||
'language_parser': JavaParser
|
||||
},
|
||||
'go': {
|
||||
'platform': 'go',
|
||||
'ext': 'go',
|
||||
'language_parser': GoParser
|
||||
},
|
||||
'javascript': {
|
||||
'platform': 'npm',
|
||||
'ext': 'js',
|
||||
'language_parser': JavascriptParser
|
||||
},
|
||||
'php': {
|
||||
'platform': 'packagist',
|
||||
'ext': 'php',
|
||||
'language_parser': PhpParser
|
||||
},
|
||||
'ruby': {
|
||||
'platform': 'rubygems',
|
||||
'ext': 'rb',
|
||||
'language_parser': RubyParser
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,25 @@
|
|||
"""
|
||||
Usage:
|
||||
parser_cli.py [options] INPUT_FILEPATH
|
||||
|
||||
Options:
|
||||
-h --help
|
||||
--language LANGUAGE Language
|
||||
"""
|
||||
import json
|
||||
|
||||
from docopt import docopt
|
||||
from tree_sitter import Language
|
||||
|
||||
from language_data import LANGUAGE_METADATA
|
||||
from process import DataProcessor
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
|
||||
DataProcessor.PARSER.set_language(Language('/src/build/py-tree-sitter-languages.so', args['--language']))
|
||||
processor = DataProcessor(language=args['--language'],
|
||||
language_parser=LANGUAGE_METADATA[args['--language']]['language_parser'])
|
||||
|
||||
functions = processor.process_single_file(args['INPUT_FILEPATH'])
|
||||
print(json.dumps(functions, indent=2))
|
|
@@ -0,0 +1,24 @@
|
|||
def strip_c_style_comment_delimiters(comment: str) -> str:
|
||||
comment_lines = comment.split('\n')
|
||||
cleaned_lines = []
|
||||
for l in comment_lines:
|
||||
l = l.strip()
|
||||
if l.endswith('*/'):
|
||||
l = l[:-2]
|
||||
if l.startswith('*'):
|
||||
l = l[1:]
|
||||
elif l.startswith('/**'):
|
||||
l = l[3:]
|
||||
elif l.startswith('//'):
|
||||
l = l[2:]
|
||||
cleaned_lines.append(l.strip())
|
||||
return '\n'.join(cleaned_lines)
|
||||
|
||||
|
||||
def get_docstring_summary(docstring: str) -> str:
|
||||
"""Get the first lines of the documentation comment up to the empty lines."""
|
||||
if '\n\n' in docstring:
|
||||
return docstring.split('\n\n')[0]
|
||||
elif '@' in docstring:
|
||||
return docstring[:docstring.find('@')] # This usually is the start of a JavaDoc-style @param comment.
|
||||
return docstring
|
|
@@ -0,0 +1,53 @@
|
|||
from typing import List, Dict, Any
|
||||
|
||||
from parsers.language_parser import LanguageParser, match_from_span, tokenize_code
|
||||
from parsers.commentutils import get_docstring_summary, strip_c_style_comment_delimiters
|
||||
|
||||
|
||||
class GoParser(LanguageParser):
|
||||
|
||||
FILTER_PATHS = ('test', 'vendor')
|
||||
|
||||
@staticmethod
|
||||
def get_definition(tree, blob: str) -> List[Dict[str, Any]]:
|
||||
definitions = []
|
||||
comment_buffer = []
|
||||
for child in tree.root_node.children:
|
||||
if child.type == 'comment':
|
||||
comment_buffer.append(child)
|
||||
elif child.type in ('method_declaration', 'function_declaration'):
|
||||
docstring = '\n'.join([match_from_span(comment, blob) for comment in comment_buffer])
|
||||
docstring_summary = strip_c_style_comment_delimiters((get_docstring_summary(docstring)))
|
||||
|
||||
metadata = GoParser.get_function_metadata(child, blob)
|
||||
definitions.append({
|
||||
'type': child.type,
|
||||
'identifier': metadata['identifier'],
|
||||
'parameters': metadata['parameters'],
|
||||
'function': match_from_span(child, blob),
|
||||
'function_tokens': tokenize_code(child, blob),
|
||||
'docstring': docstring,
|
||||
'docstring_summary': docstring_summary,
|
||||
'start_point': child.start_point,
|
||||
'end_point': child.end_point
|
||||
})
|
||||
comment_buffer = []
|
||||
else:
|
||||
comment_buffer = []
|
||||
return definitions
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_function_metadata(function_node, blob: str) -> Dict[str, str]:
|
||||
metadata = {
|
||||
'identifier': '',
|
||||
'parameters': '',
|
||||
}
|
||||
if function_node.type == 'function_declaration':
|
||||
metadata['identifier'] = match_from_span(function_node.children[1], blob)
|
||||
metadata['parameters'] = match_from_span(function_node.children[2], blob)
|
||||
elif function_node.type == 'method_declaration':
|
||||
metadata['identifier'] = match_from_span(function_node.children[2], blob)
|
||||
metadata['parameters'] = ' '.join([match_from_span(function_node.children[1], blob),
|
||||
match_from_span(function_node.children[3], blob)])
|
||||
return metadata
|
|
@@ -0,0 +1,89 @@
|
|||
from typing import List, Dict, Any
|
||||
|
||||
from parsers.language_parser import LanguageParser, match_from_span, tokenize_code, traverse_type
|
||||
from parsers.commentutils import strip_c_style_comment_delimiters, get_docstring_summary
|
||||
|
||||
|
||||
class JavaParser(LanguageParser):
|
||||
|
||||
FILTER_PATHS = ('test', 'tests')
|
||||
|
||||
BLACKLISTED_FUNCTION_NAMES = {'toString', 'hashCode', 'equals', 'finalize', 'notify', 'notifyAll', 'clone'}
|
||||
|
||||
@staticmethod
|
||||
def get_definition(tree, blob: str) -> List[Dict[str, Any]]:
|
||||
classes = (node for node in tree.root_node.children if node.type == 'class_declaration')
|
||||
|
||||
definitions = []
|
||||
for _class in classes:
|
||||
class_identifier = match_from_span([child for child in _class.children if child.type == 'identifier'][0], blob).strip()
|
||||
for child in (child for child in _class.children if child.type == 'class_body'):
|
||||
for idx, node in enumerate(child.children):
|
||||
if node.type == 'method_declaration':
|
||||
if JavaParser.is_method_body_empty(node):
|
||||
continue
|
||||
docstring = ''
|
||||
if idx - 1 >= 0 and child.children[idx-1].type == 'comment':
|
||||
docstring = match_from_span(child.children[idx - 1], blob)
|
||||
docstring = strip_c_style_comment_delimiters(docstring)
|
||||
docstring_summary = get_docstring_summary(docstring)
|
||||
|
||||
metadata = JavaParser.get_function_metadata(node, blob)
|
||||
if metadata['identifier'] in JavaParser.BLACKLISTED_FUNCTION_NAMES:
|
||||
continue
|
||||
definitions.append({
|
||||
'type': node.type,
|
||||
'identifier': '{}.{}'.format(class_identifier, metadata['identifier']),
|
||||
'parameters': metadata['parameters'],
|
||||
'function': match_from_span(node, blob),
|
||||
'function_tokens': tokenize_code(node, blob),
|
||||
'docstring': docstring,
|
||||
'docstring_summary': docstring_summary,
|
||||
'start_point': node.start_point,
|
||||
'end_point': node.end_point
|
||||
})
|
||||
return definitions
|
||||
|
||||
@staticmethod
|
||||
def get_class_metadata(class_node, blob: str) -> Dict[str, str]:
|
||||
metadata = {
|
||||
'identifier': '',
|
||||
'argument_list': '',
|
||||
}
|
||||
is_header = False
|
||||
for n in class_node.children:
|
||||
if is_header:
|
||||
if n.type == 'identifier':
|
||||
metadata['identifier'] = match_from_span(n, blob).strip('(:')
|
||||
elif n.type == 'argument_list':
|
||||
metadata['argument_list'] = match_from_span(n, blob)
|
||||
if n.type == 'class':
|
||||
is_header = True
|
||||
elif n.type == ':':
|
||||
break
|
||||
return metadata
|
||||
|
||||
@staticmethod
|
||||
def is_method_body_empty(node):
|
||||
for c in node.children:
|
||||
if c.type in {'method_body', 'constructor_body'}:
|
||||
if c.start_point[0] == c.end_point[0]:
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def get_function_metadata(function_node, blob: str) -> Dict[str, str]:
|
||||
metadata = {
|
||||
'identifier': '',
|
||||
'parameters': '',
|
||||
}
|
||||
|
||||
declarators = []
|
||||
traverse_type(function_node, declarators, '{}_declarator'.format(function_node.type.split('_')[0]))
|
||||
parameters = []
|
||||
for n in declarators[0].children:
|
||||
if n.type == 'identifier':
|
||||
metadata['identifier'] = match_from_span(n, blob).strip('(')
|
||||
elif n.type == 'formal_parameter':
|
||||
parameters.append(match_from_span(n, blob))
|
||||
metadata['parameters'] = ' '.join(parameters)
|
||||
return metadata
|
|
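A minimal driver sketch for the Java parser above. It assumes the grammars were already compiled into /src/build/py-tree-sitter-languages.so by the build script in this repo and that this file is importable as parsers.java_parser; the node types it relies on depend on the pinned tree-sitter-java grammar, so treat it as illustrative rather than a test.

from tree_sitter import Language, Parser

from parsers.java_parser import JavaParser

# Assumes the shared library produced by the grammar build script in this repo.
JAVA_LANGUAGE = Language('/src/build/py-tree-sitter-languages.so', 'java')
parser = Parser()
parser.set_language(JAVA_LANGUAGE)

blob = """
class Greeter {
    /** Returns a greeting for the given name. */
    public String greet(String name) {
        return "Hello, " + name;
    }
}
"""
tree = parser.parse(blob.encode())
for definition in JavaParser.get_definition(tree, blob):
    # Each dict carries the class-qualified identifier, parameters, docstring and span info.
    print(definition['identifier'], definition['parameters'], definition['docstring_summary'])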
@ -0,0 +1,84 @@
|
|||
from typing import List, Dict, Any
|
||||
|
||||
from parsers.language_parser import LanguageParser, match_from_span, tokenize_code, traverse_type, previous_sibling, \
|
||||
node_parent
|
||||
from parsers.commentutils import get_docstring_summary, strip_c_style_comment_delimiters
|
||||
|
||||
|
||||
class JavascriptParser(LanguageParser):
|
||||
|
||||
FILTER_PATHS = ('test', 'node_modules')
|
||||
|
||||
BLACKLISTED_FUNCTION_NAMES = {'toString', 'toLocaleString', 'valueOf'}
|
||||
|
||||
@staticmethod
|
||||
def get_docstring(tree, node, blob: str) -> str:
|
||||
docstring = ''
|
||||
parent_node = node_parent(tree, node)
|
||||
|
||||
if parent_node.type == 'variable_declarator':
|
||||
base_node = node_parent(tree, parent_node) # Get the variable declaration
|
||||
elif parent_node.type == 'pair':
|
||||
base_node = parent_node # This is a common pattern where a function is assigned as a value to a dictionary.
|
||||
else:
|
||||
base_node = node
|
||||
|
||||
prev_sibling = previous_sibling(tree, base_node)
|
||||
if prev_sibling is not None and prev_sibling.type == 'comment':
|
||||
all_prev_comment_nodes = [prev_sibling]
|
||||
prev_sibling = previous_sibling(tree, prev_sibling)
|
||||
while prev_sibling is not None and prev_sibling.type == 'comment':
|
||||
all_prev_comment_nodes.append(prev_sibling)
|
||||
last_comment_start_line = prev_sibling.start_point[0]
|
||||
prev_sibling = previous_sibling(tree, prev_sibling)
|
||||
if prev_sibling is not None and prev_sibling.end_point[0] + 1 < last_comment_start_line:
|
||||
break # if there is an empty line, stop expanding.
|
||||
|
||||
docstring = ' '.join((strip_c_style_comment_delimiters(match_from_span(s, blob)) for s in all_prev_comment_nodes[::-1]))
|
||||
return docstring
|
||||
|
||||
@staticmethod
|
||||
def get_definition(tree, blob: str) -> List[Dict[str, Any]]:
|
||||
function_nodes = []
|
||||
functions = []
|
||||
traverse_type(tree.root_node, function_nodes, 'function')
|
||||
for function in function_nodes:
|
||||
if function.children is None or len(function.children) == 0:
|
||||
continue
|
||||
parent_node = node_parent(tree, function)
|
||||
functions.append((parent_node.type, function, JavascriptParser.get_docstring(tree, function, blob)))
|
||||
|
||||
definitions = []
|
||||
for node_type, function_node, docstring in functions:
|
||||
metadata = JavascriptParser.get_function_metadata(function_node, blob)
|
||||
docstring_summary = get_docstring_summary(docstring)
|
||||
|
||||
if metadata['identifier'] in JavascriptParser.BLACKLISTED_FUNCTION_NAMES:
|
||||
continue
|
||||
definitions.append({
|
||||
'type': node_type,
|
||||
'identifier': metadata['identifier'],
|
||||
'parameters': metadata['parameters'],
|
||||
'function': match_from_span(function_node, blob),
|
||||
'function_tokens': tokenize_code(function_node, blob),
|
||||
'docstring': docstring,
|
||||
'docstring_summary': docstring_summary,
|
||||
'start_point': function_node.start_point,
|
||||
'end_point': function_node.end_point
|
||||
})
|
||||
return definitions
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_function_metadata(function_node, blob: str) -> Dict[str, str]:
|
||||
metadata = {
|
||||
'identifier': '',
|
||||
'parameters': '',
|
||||
}
|
||||
identifier_nodes = [child for child in function_node.children if child.type == 'identifier']
|
||||
formal_parameters_nodes = [child for child in function_node.children if child.type == 'formal_parameters']
|
||||
if identifier_nodes:
|
||||
metadata['identifier'] = match_from_span(identifier_nodes[0], blob)
|
||||
if formal_parameters_nodes:
|
||||
metadata['parameters'] = match_from_span(formal_parameters_nodes[0], blob)
|
||||
return metadata
|
|
@ -0,0 +1,106 @@
|
|||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Dict, Any, Set, Optional
|
||||
|
||||
DOCSTRING_REGEX_TOKENIZER = re.compile(r"[^\s,'\"`.():\[\]=*;>{\}+-/\\]+|\\+|\.+|\(\)|{\}|\[\]|\(+|\)+|:+|\[+|\]+|{+|\}+|=+|\*+|;+|>+|\++|-+|/+")
|
||||
|
||||
|
||||
def tokenize_docstring(docstring: str) -> List[str]:
|
||||
return [t for t in DOCSTRING_REGEX_TOKENIZER.findall(docstring) if t is not None and len(t) > 0]
|
||||
|
||||
|
||||
def tokenize_code(node, blob: str, nodes_to_exclude: Optional[Set]=None) -> List:
|
||||
tokens = []
|
||||
traverse(node, tokens)
|
||||
return [match_from_span(token, blob) for token in tokens if nodes_to_exclude is None or token not in nodes_to_exclude]
|
||||
|
||||
|
||||
def traverse(node, results: List) -> None:
|
||||
if node.type == 'string':
|
||||
results.append(node)
|
||||
return
|
||||
for n in node.children:
|
||||
traverse(n, results)
|
||||
if not node.children:
|
||||
results.append(node)
|
||||
|
||||
def nodes_are_equal(n1, n2):
|
||||
return n1.type == n2.type and n1.start_point == n2.start_point and n1.end_point == n2.end_point
|
||||
|
||||
def previous_sibling(tree, node):
|
||||
"""
|
||||
Search for the previous sibling of the node.
|
||||
|
||||
TODO: C TreeSitter should support this natively, but not its Python bindings yet. Replace later.
|
||||
"""
|
||||
to_visit = [tree.root_node]
|
||||
while len(to_visit) > 0:
|
||||
next_node = to_visit.pop()
|
||||
for i, node_at_i in enumerate(next_node.children):
|
||||
if nodes_are_equal(node, node_at_i):
|
||||
if i > 0:
|
||||
return next_node.children[i-1]
|
||||
return None
|
||||
else:
|
||||
to_visit.extend(next_node.children)
|
||||
raise ValueError("Could not find node in tree.")
|
||||
|
||||
|
||||
def node_parent(tree, node):
|
||||
to_visit = [tree.root_node]
|
||||
while len(to_visit) > 0:
|
||||
next_node = to_visit.pop()
|
||||
for child in next_node.children:
|
||||
if nodes_are_equal(child, node):
|
||||
return next_node
|
||||
else:
|
||||
to_visit.extend(next_node.children)
|
||||
raise ValueError("Could not find node in tree.")
|
||||
|
||||
|
||||
def match_from_span(node, blob: str) -> str:
|
||||
lines = blob.split('\n')
|
||||
line_start = node.start_point[0]
|
||||
line_end = node.end_point[0]
|
||||
char_start = node.start_point[1]
|
||||
char_end = node.end_point[1]
|
||||
if line_start != line_end:
|
||||
return '\n'.join([lines[line_start][char_start:]] + lines[line_start+1:line_end] + [lines[line_end][:char_end]])
|
||||
else:
|
||||
return lines[line_start][char_start:char_end]
|
||||
|
||||
|
||||
def traverse_type(node, results: List, kind: str) -> None:
|
||||
if node.type == kind:
|
||||
results.append(node)
|
||||
if not node.children:
|
||||
return
|
||||
for n in node.children:
|
||||
traverse_type(n, results, kind)
|
||||
|
||||
|
||||
class LanguageParser(ABC):
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_definition(tree, blob: str) -> List[Dict[str, Any]]:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_class_metadata(class_node, blob):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_function_metadata(function_node, blob) -> Dict[str, str]:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_context(tree, blob):
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_calls(tree, blob):
|
||||
raise NotImplementedError
|
|
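Two of the helpers above can be exercised without a parse tree, which makes their behaviour easy to see in isolation: tokenize_docstring works on any string, and match_from_span only reads a node's start_point and end_point, so the namedtuple stand-in below (an assumption for illustration, not a tree-sitter type) is enough to show the span slicing.

from collections import namedtuple

from parsers.language_parser import match_from_span, tokenize_docstring

print(tokenize_docstring("Returns the sum of a and b (ints)."))
# e.g. ['Returns', 'the', 'sum', 'of', 'a', 'and', 'b', '(', 'ints', ')', '.']

# FakeNode mimics only the two attributes match_from_span reads; it is not a real tree-sitter node.
FakeNode = namedtuple('FakeNode', ['start_point', 'end_point'])
blob = "def add(a, b):\n    return a + b\n"
node = FakeNode(start_point=(0, 4), end_point=(1, 16))
print(match_from_span(node, blob))  # slices from line 0, column 4 through line 1, column 16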
@ -0,0 +1,77 @@
|
|||
from typing import List, Dict, Any
|
||||
|
||||
from parsers.language_parser import LanguageParser, match_from_span, tokenize_code, traverse_type
|
||||
from parsers.commentutils import strip_c_style_comment_delimiters, get_docstring_summary
|
||||
|
||||
|
||||
class PhpParser(LanguageParser):
|
||||
|
||||
FILTER_PATHS = ('test', 'tests')
|
||||
|
||||
BLACKLISTED_FUNCTION_NAMES = {'__construct', '__destruct', '__call', '__callStatic',
|
||||
'__get', '__set', '__isset', '__unset',
|
||||
'__sleep', '__wakeup', '__toString', '__invoke',
|
||||
'__set_state', '__clone', '__debugInfo'}
|
||||
|
||||
@staticmethod
|
||||
def get_docstring(trait_node, blob: str, idx: int) -> str:
|
||||
docstring = ''
|
||||
if idx - 1 >= 0 and trait_node.children[idx-1].type == 'comment':
|
||||
docstring = match_from_span(trait_node.children[idx-1], blob)
|
||||
docstring = strip_c_style_comment_delimiters(docstring)
|
||||
return docstring
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_declarations(declaration_node, blob: str, node_type: str) -> List[Dict[str, Any]]:
|
||||
declarations = []
|
||||
for idx, child in enumerate(declaration_node.children):
|
||||
if child.type == 'name':
|
||||
declaration_name = match_from_span(child, blob)
|
||||
elif child.type == 'method_declaration':
|
||||
docstring = PhpParser.get_docstring(declaration_node, blob, idx)
|
||||
docstring_summary = get_docstring_summary(docstring)
|
||||
function_nodes = []
|
||||
traverse_type(child, function_nodes, 'function_definition')
|
||||
if function_nodes:
|
||||
function_node = function_nodes[0]
|
||||
metadata = PhpParser.get_function_metadata(function_node, blob)
|
||||
|
||||
if metadata['identifier'] in PhpParser.BLACKLISTED_FUNCTION_NAMES:
|
||||
continue
|
||||
|
||||
declarations.append({
|
||||
'type': node_type,
|
||||
'identifier': '{}.{}'.format(declaration_name, metadata['identifier']),
|
||||
'parameters': metadata['parameters'],
|
||||
'function': match_from_span(child, blob),
|
||||
'function_tokens': tokenize_code(child, blob),
|
||||
'docstring': docstring,
|
||||
'docstring_summary': docstring_summary,
|
||||
'start_point': function_node.start_point,
|
||||
'end_point': function_node.end_point
|
||||
})
|
||||
return declarations
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_definition(tree, blob: str) -> List[Dict[str, Any]]:
|
||||
trait_declarations = [child for child in tree.root_node.children if child.type == 'trait_declaration']
|
||||
class_declarations = [child for child in tree.root_node.children if child.type == 'class_declaration']
|
||||
definitions = []
|
||||
for trait_declaration in trait_declarations:
|
||||
definitions.extend(PhpParser.get_declarations(trait_declaration, blob, trait_declaration.type))
|
||||
for class_declaration in class_declarations:
|
||||
definitions.extend(PhpParser.get_declarations(class_declaration, blob, class_declaration.type))
|
||||
return definitions
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_function_metadata(function_node, blob: str) -> Dict[str, str]:
|
||||
metadata = {
|
||||
'identifier': '',
|
||||
'parameters': '',
|
||||
}
|
||||
metadata['identifier'] = match_from_span(function_node.children[1], blob)
|
||||
metadata['parameters'] = match_from_span(function_node.children[2], blob)
|
||||
return metadata
|
|
@ -0,0 +1,208 @@
|
|||
from typing import Dict, Iterable, Optional, Iterator, Any, List
|
||||
|
||||
from parsers.language_parser import LanguageParser, match_from_span, tokenize_code, traverse_type
|
||||
from parsers.commentutils import get_docstring_summary
|
||||
|
||||
|
||||
class PythonParser(LanguageParser):
|
||||
|
||||
FILTER_PATHS = ('test',)
|
||||
STOPWORDS = ()
|
||||
|
||||
# Get function calls
|
||||
@staticmethod
|
||||
def get_context(tree, blob):
|
||||
def _get_import_from(import_from_statement, blob):
|
||||
context = {}
|
||||
mode = 'from'
|
||||
library = ''
|
||||
for n in import_from_statement.children:
|
||||
if n.type == 'from':
|
||||
mode = 'from'
|
||||
elif n.type == 'import':
|
||||
mode = 'import'
|
||||
elif n.type == 'dotted_name':
|
||||
if mode == 'from':
|
||||
library = match_from_span(n, blob).strip()
|
||||
elif mode == 'import':
|
||||
if library:
|
||||
context[match_from_span(n, blob).strip().strip(',')] = library
|
||||
return context
|
||||
|
||||
def _get_import(import_statement, blob):
|
||||
context = []
|
||||
for n in import_statement.children:
|
||||
if n.type == 'dotted_name':
|
||||
context.append(match_from_span(n, blob).strip())
|
||||
if n.type == 'aliased_import':
|
||||
for a in n.children:
|
||||
if a.type == 'dotted_name':
|
||||
context.append(match_from_span(a, blob).strip())
|
||||
return context
|
||||
|
||||
import_from_statements = []
|
||||
traverse_type(tree.root_node, import_from_statements, 'import_from_statement')
|
||||
|
||||
import_statements = []
|
||||
traverse_type(tree.root_node, import_statements, 'import_statement')
|
||||
|
||||
context = []
|
||||
context.extend((_get_import_from(i, blob) for i in import_from_statements))
|
||||
context.extend((_get_import(i, blob) for i in import_statements))
|
||||
return context
|
||||
|
||||
@staticmethod
|
||||
def get_calls(tree, blob):
|
||||
calls = []
|
||||
traverse_type(tree.root_node, calls, 'call')
|
||||
|
||||
def _traverse_calls(node, identifiers):
|
||||
if node.type == 'identifier':
|
||||
identifiers.append(node)
|
||||
if not node.children or node.type == 'argument_list':
|
||||
return
|
||||
for n in node.children:
|
||||
_traverse_calls(n, identifiers)
|
||||
|
||||
results = []
|
||||
for call in calls:
|
||||
identifiers = []
|
||||
_traverse_calls(call, identifiers)
|
||||
|
||||
if identifiers:
|
||||
identifier = identifiers[-1]
|
||||
argument_lists = [n for n in call.children if n.type == 'argument_list']
|
||||
argument_list = ''
|
||||
if argument_lists:
|
||||
argument_list = match_from_span(argument_lists[-1], blob)
|
||||
results.append({
|
||||
'identifier': match_from_span(identifier, blob),
|
||||
'argument_list': argument_list,
|
||||
'start_point': identifier.start_point,
|
||||
'end_point': identifier.end_point,
|
||||
})
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def __get_docstring_node(function_node):
|
||||
docstring_node = [node for node in function_node.children if
|
||||
node.type == 'expression_statement' and node.children[0].type == 'string']
|
||||
if len(docstring_node) > 0:
|
||||
return docstring_node[0].children[0]
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def get_docstring(docstring_node, blob: str) -> str:
|
||||
docstring = ''
|
||||
if docstring_node is not None:
|
||||
docstring = match_from_span(docstring_node, blob)
|
||||
docstring = docstring.strip().strip('"').strip("'")
|
||||
return docstring
|
||||
|
||||
@staticmethod
|
||||
def get_function_metadata(function_node, blob: str) -> Dict[str, str]:
|
||||
metadata = {
|
||||
'identifier': '',
|
||||
'parameters': '',
|
||||
'return_statement': ''
|
||||
}
|
||||
is_header = False
|
||||
for child in function_node.children:
|
||||
if is_header:
|
||||
if child.type == 'identifier':
|
||||
metadata['identifier'] = match_from_span(child, blob)
|
||||
elif child.type == 'parameters':
|
||||
metadata['parameters'] = match_from_span(child, blob)
|
||||
if child.type == 'def':
|
||||
is_header = True
|
||||
elif child.type == ':':
|
||||
is_header = False
|
||||
elif child.type == 'return_statement':
|
||||
metadata['return_statement'] = match_from_span(child, blob)
|
||||
return metadata
|
||||
|
||||
@staticmethod
|
||||
def get_class_metadata(class_node, blob: str) -> Dict[str, str]:
|
||||
metadata = {
|
||||
'identifier': '',
|
||||
'argument_list': '',
|
||||
}
|
||||
is_header = False
|
||||
for child in class_node.children:
|
||||
if is_header:
|
||||
if child.type == 'identifier':
|
||||
metadata['identifier'] = match_from_span(child, blob)
|
||||
elif child.type == 'argument_list':
|
||||
metadata['argument_list'] = match_from_span(child, blob)
|
||||
if child.type == 'class':
|
||||
is_header = True
|
||||
elif child.type == ':':
|
||||
break
|
||||
return metadata
|
||||
|
||||
@staticmethod
|
||||
def is_function_empty(function_node) -> bool:
|
||||
seen_header_end = False
|
||||
for child in function_node.children:
|
||||
if seen_header_end and (child.type=='pass_statement' or child.type=='raise_statement'):
|
||||
return True
|
||||
elif seen_header_end:
|
||||
return False
|
||||
|
||||
if child.type == ':':
|
||||
seen_header_end = True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def __process_functions(functions: Iterable, blob: str, func_identifier_scope: Optional[str]=None) -> Iterator[Dict[str, Any]]:
|
||||
for function_node in functions:
|
||||
if PythonParser.is_function_empty(function_node):
|
||||
continue
|
||||
function_metadata = PythonParser.get_function_metadata(function_node, blob)
|
||||
if func_identifier_scope is not None:
|
||||
function_metadata['identifier'] = '{}.{}'.format(func_identifier_scope,
|
||||
function_metadata['identifier'])
|
||||
if function_metadata['identifier'].startswith('__') and function_metadata['identifier'].endswith('__'):
|
||||
continue # Blacklist built-in functions
|
||||
docstring_node = PythonParser.__get_docstring_node(function_node)
|
||||
function_metadata['docstring'] = PythonParser.get_docstring(docstring_node, blob)
|
||||
function_metadata['docstring_summary'] = get_docstring_summary(function_metadata['docstring'])
|
||||
function_metadata['function'] = match_from_span(function_node, blob)
|
||||
function_metadata['function_tokens'] = tokenize_code(function_node, blob, {docstring_node})
|
||||
function_metadata['start_point'] = function_node.start_point
|
||||
function_metadata['end_point'] = function_node.end_point
|
||||
|
||||
yield function_metadata
|
||||
|
||||
@staticmethod
|
||||
def get_function_definitions(node):
|
||||
for child in node.children:
|
||||
if child.type == 'function_definition':
|
||||
yield child
|
||||
elif child.type == 'decorated_definition':
|
||||
for c in child.children:
|
||||
if c.type == 'function_definition':
|
||||
yield c
|
||||
|
||||
@staticmethod
|
||||
def get_definition(tree, blob: str) -> List[Dict[str, Any]]:
|
||||
functions = PythonParser.get_function_definitions(tree.root_node)
|
||||
classes = (node for node in tree.root_node.children if node.type == 'class_definition')
|
||||
|
||||
definitions = list(PythonParser.__process_functions(functions, blob))
|
||||
|
||||
for _class in classes:
|
||||
class_metadata = PythonParser.get_class_metadata(_class, blob)
|
||||
docstring_node = PythonParser.__get_docstring_node(_class)
|
||||
class_metadata['docstring'] = PythonParser.get_docstring(docstring_node, blob)
|
||||
class_metadata['docstring_summary'] = get_docstring_summary(class_metadata['docstring'])
|
||||
class_metadata['function'] = ''
|
||||
class_metadata['function_tokens'] = []
|
||||
class_metadata['start_point'] = _class.start_point
|
||||
class_metadata['end_point'] = _class.end_point
|
||||
definitions.append(class_metadata)
|
||||
|
||||
functions = PythonParser.get_function_definitions(_class)
|
||||
definitions.extend(PythonParser.__process_functions(functions, blob, class_metadata['identifier']))
|
||||
|
||||
return definitions
|
|
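A hedged usage sketch for the Python parser above, mirroring how process.py drives it; it assumes the compiled grammar library exists at the default path used elsewhere in this repo and that this module is importable as parsers.python_parser.

from tree_sitter import Language, Parser

from parsers.python_parser import PythonParser

PY_LANGUAGE = Language('/src/build/py-tree-sitter-languages.so', 'python')
parser = Parser()
parser.set_language(PY_LANGUAGE)

blob = 'def add(a, b):\n    """Add two numbers."""\n    return a + b\n'
tree = parser.parse(blob.encode())
for d in PythonParser.get_definition(tree, blob):
    # Top-level functions keep their bare name; dunder names would be skipped as built-ins.
    print(d['identifier'], d['parameters'], d['return_statement'])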
@ -0,0 +1,77 @@
|
|||
from typing import List, Dict, Any
|
||||
|
||||
from parsers.language_parser import LanguageParser, match_from_span, tokenize_code
|
||||
from parsers.commentutils import get_docstring_summary
|
||||
|
||||
|
||||
class RubyParser(LanguageParser):
|
||||
|
||||
FILTER_PATHS = ('test', 'vendor')
|
||||
|
||||
BLACKLISTED_FUNCTION_NAMES = {'initialize', 'to_text', 'display', 'dup', 'clone', 'equal?', '==', '<=>',
|
||||
'===', '<=', '<', '>', '>=', 'between?', 'eql?', 'hash'}
|
||||
|
||||
@staticmethod
|
||||
def get_docstring(trait_node, blob: str, idx: int) -> str:
|
||||
raise NotImplementedError("Not used for Ruby.")
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_methods(module_or_class_node, blob: str, module_name: str, node_type: str) -> List[Dict[str, Any]]:
|
||||
definitions = []
|
||||
comment_buffer = []
|
||||
module_or_class_name = match_from_span(module_or_class_node.children[1], blob)
|
||||
for child in module_or_class_node.children:
|
||||
if child.type == 'comment':
|
||||
comment_buffer.append(child)
|
||||
elif child.type == 'method':
|
||||
docstring = '\n'.join([match_from_span(comment, blob).strip().strip('#') for comment in comment_buffer])
|
||||
docstring_summary = get_docstring_summary(docstring)
|
||||
|
||||
metadata = RubyParser.get_function_metadata(child, blob)
|
||||
if metadata['identifier'] in RubyParser.BLACKLISTED_FUNCTION_NAMES:
|
||||
continue
|
||||
definitions.append({
|
||||
'type': 'class',
|
||||
'identifier': '{}.{}.{}'.format(module_name, module_or_class_name, metadata['identifier']),
|
||||
'parameters': metadata['parameters'],
|
||||
'function': match_from_span(child, blob),
|
||||
'function_tokens': tokenize_code(child, blob),
|
||||
'docstring': docstring,
|
||||
'docstring_summary': docstring_summary,
|
||||
'start_point': child.start_point,
|
||||
'end_point': child.end_point
|
||||
})
|
||||
comment_buffer = []
|
||||
else:
|
||||
comment_buffer = []
|
||||
return definitions
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_definition(tree, blob: str) -> List[Dict[str, Any]]:
|
||||
definitions = []
|
||||
if 'ERROR' not in set([child.type for child in tree.root_node.children]):
|
||||
modules = [child for child in tree.root_node.children if child.type == 'module']
|
||||
for module in modules:
|
||||
if module.children:
|
||||
module_name = match_from_span(module.children[1], blob)
|
||||
sub_modules = [child for child in module.children if child.type == 'module' and child.children]
|
||||
classes = [child for child in module.children if child.type == 'class']
|
||||
for sub_module_node in sub_modules:
|
||||
definitions.extend(RubyParser.get_methods(sub_module_node, blob, module_name, sub_module_node.type))
|
||||
for class_node in classes:
|
||||
definitions.extend(RubyParser.get_methods(class_node, blob, module_name, class_node.type))
|
||||
return definitions
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_function_metadata(function_node, blob: str) -> Dict[str, str]:
|
||||
metadata = {
|
||||
'identifier': '',
|
||||
'parameters': '',
|
||||
}
|
||||
metadata['identifier'] = match_from_span(function_node.children[1], blob)
|
||||
if function_node.children[2].type == 'method_parameters':
|
||||
metadata['parameters'] = match_from_span(function_node.children[2], blob)
|
||||
return metadata
|
|
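The docstring assembly in get_methods above simply strips the leading '#' from each buffered comment node and joins the results; a plain-string sketch of that step (the comment lines are hypothetical, no parse tree is involved):

# Stand-ins for the text of consecutive Ruby comment nodes collected in comment_buffer.
comment_lines = ['# Computes the checksum', '#   of the given payload.']
docstring = '\n'.join(line.strip().strip('#') for line in comment_lines)
print(docstring)  # leading '#' removed, inner indentation preserved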
@ -0,0 +1,232 @@
|
|||
"""
|
||||
Usage:
|
||||
process.py [options] INPUT_DIR OUTPUT_DIR
|
||||
|
||||
Options:
|
||||
-h --help
|
||||
--language LANGUAGE Language
|
||||
--processes PROCESSES # of processes to use [default: 16]
|
||||
--license-filter FILE License metadata to filter, every row contains [nwo, license, language, score] (e.g. ['pandas-dev/pandas', 'bsd-3-clause', 'Python', 0.9997])
|
||||
--tree-sitter-build FILE [default: /src/build/py-tree-sitter-languages.so]
|
||||
"""
|
||||
import functools
|
||||
from multiprocessing import Pool
|
||||
import pickle
|
||||
from os import PathLike
|
||||
from typing import Optional, Tuple, Type, List, Dict, Any
|
||||
|
||||
from docopt import docopt
|
||||
from dpu_utils.codeutils.deduplication import DuplicateDetector
|
||||
import pandas as pd
|
||||
from tree_sitter import Language, Parser
|
||||
|
||||
from language_data import LANGUAGE_METADATA
|
||||
from parsers.language_parser import LanguageParser, tokenize_docstring
|
||||
from utils import download, get_sha, flatten, remap_nwo, walk
|
||||
|
||||
class DataProcessor:
|
||||
|
||||
PARSER = Parser()
|
||||
|
||||
def __init__(self, language: str, language_parser: Type[LanguageParser]):
|
||||
self.language = language
|
||||
self.language_parser = language_parser
|
||||
|
||||
def process_dee(self, nwo, ext) -> List[Dict[str, Any]]:
|
||||
# Process dependees (libraries) to get function implementations
|
||||
indexes = []
|
||||
_, nwo = remap_nwo(nwo)
|
||||
if nwo is None:
|
||||
return indexes
|
||||
|
||||
tmp_dir = download(nwo)
|
||||
files = walk(tmp_dir, ext)
|
||||
# files = glob.iglob(tmp_dir.name + '/**/*.{}'.format(ext), recursive=True)
|
||||
sha = None
|
||||
|
||||
for f in files:
|
||||
definitions = self.get_function_definitions(f)
|
||||
if definitions is None:
|
||||
continue
|
||||
if sha is None:
|
||||
sha = get_sha(tmp_dir, nwo)
|
||||
|
||||
nwo, path, functions = definitions
|
||||
indexes.extend((self.extract_function_data(func, nwo, path, sha) for func in functions if len(func['function_tokens']) > 1))
|
||||
return indexes
|
||||
|
||||
def process_dent(self, nwo, ext, library_candidates) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]:
|
||||
# Process dependents (applications) to get function calls
|
||||
dents = []
|
||||
edges = []
|
||||
_, nwo = remap_nwo(nwo)
|
||||
if nwo is None:
|
||||
return dents, edges
|
||||
|
||||
tmp_dir = download(nwo)
|
||||
files = walk(tmp_dir, ext)
|
||||
sha = None
|
||||
|
||||
for f in files:
|
||||
context_and_calls = self.get_context_and_function_calls(f)
|
||||
if context_and_calls is None:
|
||||
continue
|
||||
if sha is None:
|
||||
sha = get_sha(tmp_dir, nwo)
|
||||
|
||||
nwo, path, context, calls = context_and_calls
|
||||
libraries = []
|
||||
for cxt in context:
|
||||
if type(cxt) == dict:
|
||||
libraries.extend([v.split('.')[0] for v in cxt.values()])
|
||||
elif type(cxt) == list:
|
||||
libraries.extend(cxt)
|
||||
|
||||
match_scopes = {}
|
||||
for cxt in set(libraries):
|
||||
if cxt in library_candidates:
|
||||
match_scopes[cxt] = library_candidates[cxt]
|
||||
|
||||
for call in calls:
|
||||
for depended_library_name, depended_library_functions in match_scopes.items():
|
||||
for depended_library_function in depended_library_functions:
|
||||
# Other potential filters: len(call['identifier']) > 6 or len(call['identifier'].split('_')) > 1
|
||||
if (call['identifier'] not in self.language_parser.STOPWORDS and
|
||||
((depended_library_function['identifier'].split('.')[-1] == '__init__' and
|
||||
call['identifier'] == depended_library_function['identifier'].split('.')[0]) or
|
||||
((len(call['identifier']) > 9 or
|
||||
(not call['identifier'].startswith('_') and len(call['identifier'].split('_')) > 1)) and
|
||||
call['identifier'] == depended_library_function['identifier'])
|
||||
)):
|
||||
dent = {
|
||||
'nwo': nwo,
|
||||
'sha': sha,
|
||||
'path': path,
|
||||
'language': self.language,
|
||||
'identifier': call['identifier'],
|
||||
'argument_list': call['argument_list'],
|
||||
'url': 'https://github.com/{}/blob/{}/{}#L{}-L{}'.format(nwo, sha, path,
|
||||
call['start_point'][0] + 1,
|
||||
call['end_point'][0] + 1)
|
||||
}
|
||||
dents.append(dent)
|
||||
edges.append((dent['url'], depended_library_function['url']))
|
||||
return dents, edges
|
||||
|
||||
def process_single_file(self, filepath: PathLike) -> List[Dict[str, Any]]:
|
||||
definitions = self.get_function_definitions(filepath)
|
||||
if definitions is None:
|
||||
return []
|
||||
_, _, functions = definitions
|
||||
|
||||
return [self.extract_function_data(func, '', '', '') for func in functions if len(func['function_tokens']) > 1]
|
||||
|
||||
def extract_function_data(self, function: Dict[str, Any], nwo, path: str, sha: str):
|
||||
return {
|
||||
'nwo': nwo,
|
||||
'sha': sha,
|
||||
'path': path,
|
||||
'language': self.language,
|
||||
'identifier': function['identifier'],
|
||||
'parameters': function.get('parameters', ''),
|
||||
'argument_list': function.get('argument_list', ''),
|
||||
'return_statement': function.get('return_statement', ''),
|
||||
'docstring': function['docstring'].strip(),
|
||||
'docstring_summary': function['docstring_summary'].strip(),
|
||||
'docstring_tokens': tokenize_docstring(function['docstring_summary']),
|
||||
'function': function['function'].strip(),
|
||||
'function_tokens': function['function_tokens'],
|
||||
'url': 'https://github.com/{}/blob/{}/{}#L{}-L{}'.format(nwo, sha, path, function['start_point'][0] + 1,
|
||||
function['end_point'][0] + 1)
|
||||
}
|
||||
|
||||
def get_context_and_function_calls(self, filepath: str) -> Optional[Tuple[str, str, List, List]]:
|
||||
nwo = '/'.join(filepath.split('/')[3:5])
|
||||
path = '/'.join(filepath.split('/')[5:])
|
||||
if any(fp in path.lower() for fp in self.language_parser.FILTER_PATHS):
|
||||
return None
|
||||
try:
|
||||
with open(filepath) as source_code:
|
||||
blob = source_code.read()
|
||||
tree = DataProcessor.PARSER.parse(blob.encode())
|
||||
return (nwo, path, self.language_parser.get_context(tree, blob), self.language_parser.get_calls(tree, blob))
|
||||
except (UnicodeDecodeError, FileNotFoundError, IsADirectoryError, ValueError, OSError):
|
||||
return None
|
||||
|
||||
def get_function_definitions(self, filepath: str) -> Optional[Tuple[str, str, List]]:
|
||||
nwo = '/'.join(filepath.split('/')[3:5])
|
||||
path = '/'.join(filepath.split('/')[5:])
|
||||
if any(fp in path.lower() for fp in self.language_parser.FILTER_PATHS):
|
||||
return None
|
||||
try:
|
||||
with open(filepath) as source_code:
|
||||
blob = source_code.read()
|
||||
tree = DataProcessor.PARSER.parse(blob.encode())
|
||||
return (nwo, path, self.language_parser.get_definition(tree, blob))
|
||||
except (UnicodeDecodeError, FileNotFoundError, IsADirectoryError, ValueError, OSError):
|
||||
return None
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
|
||||
repository_dependencies = pd.read_csv(args['INPUT_DIR'] + 'repository_dependencies-1.4.0-2018-12-22.csv', index_col=False)
|
||||
projects = pd.read_csv(args['INPUT_DIR'] + 'projects_with_repository_fields-1.4.0-2018-12-22.csv', index_col=False)
|
||||
|
||||
repository_dependencies['Manifest Platform'] = repository_dependencies['Manifest Platform'].apply(lambda x: x.lower())
|
||||
id_to_nwo = {project['ID']: project['Repository Name with Owner'] for project in projects[['ID', 'Repository Name with Owner']].dropna().to_dict(orient='records')}
|
||||
nwo_to_name = {project['Repository Name with Owner']: project['Name'] for project in projects[['Repository Name with Owner', 'Name']].dropna().to_dict(orient='records')}
|
||||
|
||||
filtered = repository_dependencies[(repository_dependencies['Host Type'] == 'GitHub') & (repository_dependencies['Manifest Platform'] == LANGUAGE_METADATA[args['--language']]['platform'])][['Repository Name with Owner', 'Dependency Project ID']].dropna().to_dict(orient='records')
|
||||
|
||||
dependency_pairs = [(rd['Repository Name with Owner'], id_to_nwo[int(rd['Dependency Project ID'])])
|
||||
for rd in filtered if int(rd['Dependency Project ID']) in id_to_nwo]
|
||||
|
||||
dependency_pairs = list(set(dependency_pairs))
|
||||
|
||||
dents, dees = zip(*dependency_pairs)
|
||||
# dents = list(set(dents))
|
||||
dees = list(set(dees))
|
||||
|
||||
DataProcessor.PARSER.set_language(Language(args['--tree-sitter-build'], args['--language']))
|
||||
|
||||
processor = DataProcessor(language=args['--language'],
|
||||
language_parser=LANGUAGE_METADATA[args['--language']]['language_parser'])
|
||||
|
||||
with Pool(processes=int(args['--processes'])) as pool:
|
||||
output = pool.imap_unordered(functools.partial(processor.process_dee,
|
||||
ext=LANGUAGE_METADATA[args['--language']]['ext']),
|
||||
dees)
|
||||
|
||||
definitions = list(flatten(output))
|
||||
with open(args['OUTPUT_DIR'] + '{}_definitions.pkl'.format(args['--language']), 'wb') as f:
|
||||
pickle.dump(definitions, f)
|
||||
|
||||
license_filter_file = args.get('--license-filter')
|
||||
if license_filter_file is not None:
|
||||
with open(license_filter_file, 'rb') as f:
|
||||
license_filter = pickle.load(f)
|
||||
valid_nwos = dict([(l[0], l[3]) for l in license_filter])
|
||||
|
||||
# Sort function definitions with repository popularity
|
||||
definitions = [dict(list(d.items()) + [('score', valid_nwos[d['nwo']])]) for d in definitions if d['nwo'] in valid_nwos]
|
||||
definitions = sorted(definitions, key=lambda x: -x['score'])
|
||||
|
||||
# dedupe
|
||||
seen = set()
|
||||
filtered = []
|
||||
for d in definitions:
|
||||
if ' '.join(d['function_tokens']) not in seen:
|
||||
filtered.append(d)
|
||||
seen.add(' '.join(d['function_tokens']))
|
||||
|
||||
dd = DuplicateDetector(min_num_tokens_per_document=10)
|
||||
filter_mask = [dd.add_file(id=idx,
|
||||
tokens=d['function_tokens'],
|
||||
language=d['language']) for idx, d in enumerate(filtered)]
|
||||
exclusion_set = dd.compute_ids_to_exclude()
|
||||
exclusion_mask = [idx not in exclusion_set for idx, _ in enumerate(filtered)]
|
||||
filtered = [d for idx, d in enumerate(filtered) if filter_mask[idx] & exclusion_mask[idx]]
|
||||
|
||||
with open(args['OUTPUT_DIR'] + '{}_dedupe_definitions.pkl'.format(args['--language']), 'wb') as f:
|
||||
pickle.dump(filtered, f)
|
|
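An illustrative, standalone way to run the DataProcessor above on one local file, outside the dependency-graph pipeline. It assumes the grammar library path used in the defaults above and that LANGUAGE_METADATA maps each language to its parser class, as this script already relies on; the input path is hypothetical.

from tree_sitter import Language

from language_data import LANGUAGE_METADATA
from process import DataProcessor

language = 'python'
DataProcessor.PARSER.set_language(Language('/src/build/py-tree-sitter-languages.so', language))
processor = DataProcessor(language=language,
                          language_parser=LANGUAGE_METADATA[language]['language_parser'])

# process_single_file leaves nwo/path/sha blank, which is fine for ad-hoc inspection.
for row in processor.process_single_file('/tmp/example.py'):  # hypothetical input file
    print(row['identifier'], len(row['function_tokens']), row['docstring_summary'])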
@ -0,0 +1,73 @@
|
|||
"""
|
||||
Usage:
|
||||
process_calls.py [options] INPUT_DIR DEFINITION_FILE OUTPUT_DIR
|
||||
|
||||
Options:
|
||||
-h --help
|
||||
--language LANGUAGE Language
|
||||
--processes PROCESSES # of processes to use [default: 16]
|
||||
--tree-sitter-build FILE [default: /src/build/py-tree-sitter-languages.so]
|
||||
"""
|
||||
from collections import Counter, defaultdict
|
||||
import functools
|
||||
import gzip
|
||||
from multiprocessing import Pool
|
||||
import pandas as pd
|
||||
import pickle
|
||||
|
||||
from docopt import docopt
|
||||
from tree_sitter import Language
|
||||
|
||||
from language_data import LANGUAGE_METADATA
|
||||
from process import DataProcessor
from utils import flatten
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
|
||||
repository_dependencies = pd.read_csv(args['INPUT_DIR'] + 'repository_dependencies-1.4.0-2018-12-22.csv', index_col=False)
|
||||
projects = pd.read_csv(args['INPUT_DIR'] + 'projects_with_repository_fields-1.4.0-2018-12-22.csv', index_col=False)
|
||||
|
||||
repository_dependencies['Manifest Platform'] = repository_dependencies['Manifest Platform'].apply(lambda x: x.lower())
|
||||
id_to_nwo = {project['ID']: project['Repository Name with Owner'] for project in projects[['ID', 'Repository Name with Owner']].dropna().to_dict(orient='records')}
|
||||
nwo_to_name = {project['Repository Name with Owner']: project['Name'] for project in projects[['Repository Name with Owner', 'Name']].dropna().to_dict(orient='records')}
|
||||
|
||||
filtered = repository_dependencies[(repository_dependencies['Host Type'] == 'GitHub') & (repository_dependencies['Manifest Platform'] == LANGUAGE_METADATA[args['--language']]['platform'])][['Repository Name with Owner', 'Dependency Project ID']].dropna().to_dict(orient='records')
|
||||
|
||||
dependency_pairs = [(rd['Repository Name with Owner'], id_to_nwo[int(rd['Dependency Project ID'])])
|
||||
for rd in filtered if int(rd['Dependency Project ID']) in id_to_nwo]
|
||||
|
||||
dependency_pairs = list(set(dependency_pairs))
|
||||
|
||||
dents, dees = zip(*dependency_pairs)
|
||||
dents = list(set(dents))
|
||||
|
||||
definitions = defaultdict(list)
|
||||
with open(args['DEFINITION_FILE'], 'rb') as f:
|
||||
for d in pickle.load(f):
|
||||
definitions[d['nwo']].append(d)
|
||||
definitions = dict(definitions)
|
||||
|
||||
# Fill candidates from most depended libraries
|
||||
c = Counter(dees)
|
||||
library_candidates = {}
|
||||
for nwo, _ in c.most_common(len(c)):
|
||||
if nwo.split('/')[-1] not in library_candidates and nwo in definitions:
|
||||
# Approximate library name with the repository name from nwo
|
||||
library_candidates[nwo.split('/')[-1]] = definitions[nwo]
|
||||
|
||||
DataProcessor.PARSER.set_language(Language(args['--tree-sitter-build'], args['--language']))
|
||||
processor = DataProcessor(language=args['--language'],
|
||||
language_parser=LANGUAGE_METADATA[args['--language']]['language_parser'])
|
||||
|
||||
with Pool(processes=int(args['--processes'])) as pool:
|
||||
output = pool.imap_unordered(functools.partial(processor.process_dent,
|
||||
ext=LANGUAGE_METADATA[args['--language']]['ext']),
|
||||
dents)
|
||||
|
||||
dent_definitions, edges = map(list, map(flatten, zip(*output)))
|
||||
|
||||
with gzip.GzipFile(args['OUTPUT_DIR'] + '{}_dent_definitions.pkl.gz'.format(args['--language']), 'wb') as outfile:
|
||||
pickle.dump(dent_definitions, outfile)
|
||||
with gzip.GzipFile(args['OUTPUT_DIR'] + '{}_edges.pkl.gz'.format(args['--language']), 'wb') as outfile:
|
||||
pickle.dump(edges, outfile)
|
|
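The candidate-library step above keys each dependee only by the repository name after the slash, with the most-depended-upon repository claiming the name first. A toy sketch with made-up repositories shows that tie-breaking:

from collections import Counter

# Hypothetical dependees and previously extracted definitions.
dees = ['psf/requests', 'pandas-dev/pandas', 'psf/requests', 'someone/requests']
definitions = {'psf/requests': ['<defs>'], 'pandas-dev/pandas': ['<defs>'], 'someone/requests': ['<defs>']}

library_candidates = {}
for nwo, _ in Counter(dees).most_common():
    # The most-depended-on repo wins the short name, so 'someone/requests' is skipped here.
    if nwo.split('/')[-1] not in library_candidates and nwo in definitions:
        library_candidates[nwo.split('/')[-1]] = definitions[nwo]

print(sorted(library_candidates))  # ['pandas', 'requests']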
@ -0,0 +1,59 @@
|
|||
import itertools
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from typing import List, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def flatten(l):
|
||||
"""Flatten list of lists.
|
||||
Args:
|
||||
l: A list of lists
|
||||
Returns: A flattened iterable
|
||||
"""
|
||||
return itertools.chain.from_iterable(l)
|
||||
|
||||
|
||||
def chunks(l: List, n: int):
|
||||
"""Yield successive n-sized chunks from l."""
|
||||
for i in range(0, len(l), n):
|
||||
yield l[i:i + n]
|
||||
|
||||
|
||||
def remap_nwo(nwo: str) -> Tuple[str, str]:
|
||||
r = requests.get('https://github.com/{}'.format(nwo))
|
||||
if r.status_code not in (404, 451, 502): # DMCA
|
||||
if 'migrated' not in r.text:
|
||||
if r.history:
|
||||
return (nwo, '/'.join(re.findall(r'"https://github.com/.+"', r.history[0].text)[0].strip('"').split('/')[-2:]))
|
||||
return (nwo, nwo)
|
||||
return (nwo, None)
|
||||
|
||||
|
||||
def get_sha(tmp_dir: tempfile.TemporaryDirectory, nwo: str):
|
||||
os.chdir(os.path.join(tmp_dir.name, nwo))
|
||||
# git rev-parse HEAD
|
||||
cmd = ['git', 'rev-parse', 'HEAD']
|
||||
sha = subprocess.check_output(cmd).strip().decode('utf-8')
|
||||
os.chdir('/tmp')
|
||||
return sha
|
||||
|
||||
|
||||
def download(nwo: str):
|
||||
os.environ['GIT_TERMINAL_PROMPT'] = '0'
|
||||
tmp_dir = tempfile.TemporaryDirectory()
|
||||
cmd = ['git', 'clone', '--depth=1', 'https://github.com/{}.git'.format(nwo), '{}/{}'.format(tmp_dir.name, nwo)]
|
||||
subprocess.run(cmd, stdin=subprocess.DEVNULL, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
|
||||
return tmp_dir
|
||||
|
||||
|
||||
def walk(tmp_dir: tempfile.TemporaryDirectory, ext: str):
|
||||
results = []
|
||||
for root, _, files in os.walk(tmp_dir.name):
|
||||
for f in files:
|
||||
if f.endswith('.' + ext):
|
||||
results.append(os.path.join(root, f))
|
||||
return results
|
|
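Quick illustrations of the two pure helpers above; flatten returns a lazy iterator, so it is wrapped in list() here.

from utils import chunks, flatten

print(list(flatten([[1, 2], [3], []])))    # [1, 2, 3]
print(list(chunks([1, 2, 3, 4, 5], n=2)))  # [[1, 2], [3, 4], [5]]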
@ -0,0 +1,6 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -e
|
||||
cd $(dirname "$0")/..
|
||||
|
||||
docker build -t function-parser .
|
|
@ -0,0 +1,6 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -e
|
||||
cd $(dirname "$0")/..
|
||||
|
||||
docker run -it --network=host -v $(pwd):/src/function-parser function-parser bash
|
|
@ -0,0 +1,6 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -e
|
||||
cd $(dirname "$0")/..
|
||||
|
||||
docker run -p 8888:8888 -v $(pwd):/src/function-parser function-parser
|
|
@ -0,0 +1,10 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -e
|
||||
cd $(dirname "$0")/..
|
||||
|
||||
mkdir -p data
|
||||
cd data
|
||||
wget https://zenodo.org/record/2536573/files/Libraries.io-open-data-1.4.0.tar.gz
|
||||
tar -zxvf Libraries.io-open-data-1.4.0.tar.gz
|
||||
cd ..
|
|
@ -0,0 +1,22 @@
|
|||
import glob
|
||||
|
||||
from tree_sitter import Language
|
||||
|
||||
languages = [
|
||||
'/src/vendor/tree-sitter-python',
|
||||
'/src/vendor/tree-sitter-javascript',
|
||||
# '/src/vendor/tree-sitter-typescript/typescript',
|
||||
# '/src/vendor/tree-sitter-typescript/tsx',
|
||||
'/src/vendor/tree-sitter-go',
|
||||
'/src/vendor/tree-sitter-ruby',
|
||||
'/src/vendor/tree-sitter-java',
|
||||
'/src/vendor/tree-sitter-cpp',
|
||||
'/src/vendor/tree-sitter-php',
|
||||
]
|
||||
|
||||
Language.build_library(
|
||||
# Store the library in the directory
|
||||
'/src/build/py-tree-sitter-languages.so',
|
||||
# Include one or more languages
|
||||
languages
|
||||
)
|
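Once Language.build_library has produced the shared object, each bundled grammar can be loaded from it by name; a short check (the choice of Go is arbitrary):

from tree_sitter import Language, Parser

GO_LANGUAGE = Language('/src/build/py-tree-sitter-languages.so', 'go')
parser = Parser()
parser.set_language(GO_LANGUAGE)
tree = parser.parse(b'package main\nfunc main() {}\n')
print(tree.root_node.type)  # root node type, e.g. 'source_file' for the Go grammar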
Binary file not shown.
After Width: | Height: | Size: 33 KiB |
Diff not rendered because the file is too large.
|
@ -0,0 +1,15 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -e
|
||||
cd $(dirname "$0")/..
|
||||
|
||||
docker build -f docker/preprocessing.Dockerfile -t preprocessing .
|
||||
|
||||
# try to build the container locally; if that fails, pull it from DockerHub.
|
||||
if docker build -f docker/docker-gpu.Dockerfile -t csnet:gpu . 2>/dev/null; then
|
||||
echo "built image csnet:gpu locally."
|
||||
else
|
||||
echo "local build failed, pulling image from DockerHub instead"
|
||||
docker pull github/csnet:gpu
|
||||
docker tag github/csnet:gpu csnet:gpu
|
||||
fi
|
|
@ -0,0 +1,12 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -e
|
||||
cd $(dirname "$0")/..
|
||||
|
||||
# detect if user is on new version of nvidia-docker and run the appropriate command
|
||||
if docker run --gpus all nvidia/cuda:9.0-base nvidia-smi 2>/dev/null; then
|
||||
docker run --gpus all -it --net=host -v $(pwd):/home/dev csnet:gpu bash
|
||||
else
|
||||
docker run --runtime=nvidia -it --net=host -v $(pwd):/home/dev csnet:gpu bash
|
||||
fi
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -e
|
||||
|
||||
cd /home/dev/script/
|
||||
rm -rf ../resources/data
|
||||
|
||||
python download_dataset.py ../resources/data
|
|
@ -0,0 +1,27 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Usage:
|
||||
download_dataset.py DESTINATION_DIR
|
||||
|
||||
Options:
|
||||
-h --help Show this screen.
|
||||
"""
|
||||
|
||||
import os
|
||||
from subprocess import call
|
||||
|
||||
from docopt import docopt
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
|
||||
destination_dir = os.path.abspath(args['DESTINATION_DIR'])
|
||||
if not os.path.exists(destination_dir):
|
||||
os.makedirs(destination_dir)
|
||||
os.chdir(destination_dir)
|
||||
|
||||
for language in ('python', 'javascript', 'java', 'ruby', 'php', 'go'):
|
||||
call(['wget', 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{}.zip'.format(language), '-P', destination_dir, '-O', '{}.zip'.format(language)])
|
||||
call(['unzip', '{}.zip'.format(language)])
|
||||
call(['rm', '{}.zip'.format(language)])
|
|
@ -0,0 +1,7 @@
|
|||
#!/bin/sh
|
||||
|
||||
set -e
|
||||
cd $(dirname "$0")/..
|
||||
|
||||
script/bootstrap
|
||||
docker run -v $(pwd):/home/dev preprocessing
|
|
@ -0,0 +1,7 @@
|
|||
azure_auth.json
|
||||
.idea
|
||||
*.pkl.gz
|
||||
*.pkl
|
||||
*.train_log
|
||||
.pt_description_history
|
||||
wandb-debug.log
|
|
@ -0,0 +1,6 @@
|
|||
../resources/data/python/final/jsonl/test
|
||||
../resources/data/javascript/final/jsonl/test
|
||||
../resources/data/java/final/jsonl/test
|
||||
../resources/data/php/final/jsonl/test
|
||||
../resources/data/ruby/final/jsonl/test
|
||||
../resources/data/go/final/jsonl/test
|
|
@ -0,0 +1,6 @@
|
|||
../resources/data/python/final/jsonl/train
|
||||
../resources/data/javascript/final/jsonl/train
|
||||
../resources/data/java/final/jsonl/train
|
||||
../resources/data/php/final/jsonl/train
|
||||
../resources/data/ruby/final/jsonl/train
|
||||
../resources/data/go/final/jsonl/train
|
|
@ -0,0 +1,6 @@
|
|||
../resources/data/python/final/jsonl/valid
|
||||
../resources/data/javascript/final/jsonl/valid
|
||||
../resources/data/java/final/jsonl/valid
|
||||
../resources/data/php/final/jsonl/valid
|
||||
../resources/data/ruby/final/jsonl/valid
|
||||
../resources/data/go/final/jsonl/valid
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -0,0 +1,143 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Remove near duplicates from data and perform train/test/validation/holdout splits.
|
||||
|
||||
Usage:
|
||||
dedup_split.py [options] INPUT_FILENAME OUTPUT_FOLDER
|
||||
|
||||
Arguments:
|
||||
INPUT_FILENAME directory with compressed jsonl files that have a .jsonl.gz file extension
|
||||
OUTPUT_FOLDER directory where you want to save data to.
|
||||
|
||||
Options:
|
||||
-h --help Show this screen.
|
||||
--azure-info=<path> Azure authentication information file (JSON).
|
||||
--train-ratio FLOAT Ratio of files for training set. [default: 0.6]
|
||||
--valid-ratio FLOAT Ratio of files for validation set. [default: 0.15]
|
||||
--test-ratio FLOAT Ratio of files for test set. [default: 0.15]
|
||||
--holdout-ratio FLOAT Ratio of files for holdout set. [default: 0.1]
|
||||
--debug Enable debug routines. [default: False]
|
||||
|
||||
Example:
|
||||
|
||||
python dedup_split.py \
|
||||
--azure-info /ds/hamel/azure_auth.json \
|
||||
azure://semanticcodesearch/pythondata/raw_v2 \
|
||||
azure://semanticcodesearch/pythondata/Processed_Data_v2
|
||||
|
||||
"""
|
||||
|
||||
from docopt import docopt
|
||||
import hashlib
|
||||
import pandas as pd
|
||||
from utils.pkldf2jsonl import chunked_save_df_to_jsonl
|
||||
from dpu_utils.utils import RichPath, run_and_debug
|
||||
from dpu_utils.codeutils.deduplication import DuplicateDetector
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def jsonl_to_df(input_folder: RichPath) -> pd.DataFrame:
|
||||
"Concatenates all jsonl files from path and returns them as a single pandas.DataFrame ."
|
||||
|
||||
assert input_folder.is_dir(), 'Argument supplied must be a directory'
|
||||
dfs = []
|
||||
files = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))
|
||||
assert files, 'There were no jsonl.gz files in the specified directory.'
|
||||
print(f'reading files from {input_folder.path}')
|
||||
for f in tqdm(files, total=len(files)):
|
||||
dfs.append(pd.DataFrame(list(f.read_as_jsonl(error_handling=lambda m,e: print(f'Error while loading {m} : {e}')))))
|
||||
return pd.concat(dfs)
|
||||
|
||||
|
||||
def remove_duplicate_code_df(df: pd.DataFrame) -> pd.DataFrame:
|
||||
"Resolve near duplicates based upon code_tokens field in data."
|
||||
assert 'code_tokens' in df.columns.values, 'Data must contain field code_tokens'
|
||||
assert 'language' in df.columns.values, 'Data must contain field language'
|
||||
df.reset_index(inplace=True, drop=True)
|
||||
df['doc_id'] = df.index.values
|
||||
dd = DuplicateDetector(min_num_tokens_per_document=10)
|
||||
filter_mask = df.apply(lambda x: dd.add_file(id=x.doc_id,
|
||||
tokens=x.code_tokens,
|
||||
language=x.language),
|
||||
axis=1)
|
||||
# compute fuzzy duplicates
|
||||
exclusion_set = dd.compute_ids_to_exclude()
|
||||
# compute pandas.series of type boolean which flags whether or not code should be discarded
|
||||
# in order to resolve duplicates (discards all but one in each set of duplicate functions)
|
||||
exclusion_mask = df['doc_id'].apply(lambda x: x not in exclusion_set)
|
||||
|
||||
# filter the data
|
||||
print(f'Removed {sum(~(filter_mask & exclusion_mask)):,} fuzzy duplicates out of {df.shape[0]:,} rows.')
|
||||
return df[filter_mask & exclusion_mask]
|
||||
|
||||
|
||||
def label_folds(df: pd.DataFrame, train_ratio: float, valid_ratio: float, test_ratio: float, holdout_ratio: float) -> pd.DataFrame:
|
||||
"Adds a partition column to DataFrame with values: {train, valid, test, holdout}."
|
||||
assert abs(train_ratio + valid_ratio + test_ratio + holdout_ratio - 1) < 1e-5, 'Ratios must sum up to 1.'
|
||||
# code in the same file will always go to the same split
|
||||
df['hash_key'] = df.apply(lambda x: f'{x.repo}:{x.path}', axis=1)
|
||||
df['hash_val'] = df['hash_key'].apply(lambda x: int(hashlib.md5(x.encode()).hexdigest(), 16) % (2**16))
|
||||
|
||||
train_bound = int(2**16 * train_ratio)
|
||||
valid_bound = train_bound + int(2**16 * valid_ratio)
|
||||
test_bound = valid_bound + int(2**16 * test_ratio)
|
||||
|
||||
def label_splits(hash_val: int) -> str:
|
||||
if hash_val <= train_bound:
|
||||
return "train"
|
||||
elif hash_val <= valid_bound:
|
||||
return "valid"
|
||||
elif hash_val <= test_bound:
|
||||
return "test"
|
||||
else:
|
||||
return "holdout"
|
||||
|
||||
# apply partition logic
|
||||
df['partition'] = df['hash_val'].apply(lambda x: label_splits(x))
|
||||
# display summary statistics
|
||||
counts = df.groupby('partition')['repo'].count().rename('count')
|
||||
summary_df = pd.concat([counts, (counts / counts.sum()).rename('pct')], axis=1)
|
||||
print(summary_df)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def run(args):
|
||||
|
||||
azure_info_path = args.get('--azure-info', None)
|
||||
input_path = RichPath.create(args['INPUT_FILENAME'], azure_info_path)
|
||||
output_folder = RichPath.create(args['OUTPUT_FOLDER'], azure_info_path)
|
||||
train = float(args['--train-ratio'])
|
||||
valid = float(args['--valid-ratio'])
|
||||
test = float(args['--test-ratio'])
|
||||
holdout = float(args['--holdout-ratio'])
|
||||
|
||||
# get data and process it
|
||||
df = jsonl_to_df(input_path)
|
||||
print('Removing fuzzy duplicates ... this may take some time.')
|
||||
df = remove_duplicate_code_df(df)
|
||||
df = df.sample(frac=1, random_state=20181026) # shuffle order of files
|
||||
df = label_folds(df, train_ratio=train, valid_ratio=valid, test_ratio=test, holdout_ratio=holdout)
|
||||
splits = ['train', 'valid', 'test', 'holdout']
|
||||
|
||||
for split in splits:
|
||||
split_df = df[df.partition == split]
|
||||
|
||||
# save dataframes as chunked jsonl files
|
||||
jsonl_save_folder = output_folder.join(f'jsonl/{split}')
|
||||
print(f'Uploading data to {str(jsonl_save_folder)}')
|
||||
chunked_save_df_to_jsonl(split_df, jsonl_save_folder)
|
||||
|
||||
# Upload dataframes to Azure
|
||||
filename = f'/tmp/{split}_df.pkl'
|
||||
df_save_path = output_folder.join(f'DataFrame/{split}_df.pkl')
|
||||
split_df.to_pickle(filename)
|
||||
print(f'Uploading data to {str(df_save_path)}')
|
||||
df_save_path.copy_from(RichPath.create(filename))
|
||||
os.unlink(filename)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
run_and_debug(lambda: run(args), args.get('--debug'))
|
|
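The split assignment in label_folds is deterministic per repo:path key, so re-running the script never moves a file across partitions. A standalone sketch of that bucketing, mirroring the defaults above:

import hashlib

def partition_for(repo: str, path: str,
                  train_ratio: float = 0.6, valid_ratio: float = 0.15, test_ratio: float = 0.15) -> str:
    # Same scheme as label_folds: md5 of 'repo:path' folded into 2**16 buckets.
    hash_val = int(hashlib.md5(f'{repo}:{path}'.encode()).hexdigest(), 16) % (2 ** 16)
    train_bound = int(2 ** 16 * train_ratio)
    valid_bound = train_bound + int(2 ** 16 * valid_ratio)
    test_bound = valid_bound + int(2 ** 16 * test_ratio)
    if hash_val <= train_bound:
        return 'train'
    elif hash_val <= valid_bound:
        return 'valid'
    elif hash_val <= test_bound:
        return 'test'
    return 'holdout'

print(partition_for('pandas-dev/pandas', 'pandas/core/frame.py'))  # stable across runs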
@ -0,0 +1,295 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Acquires python data from local disk or GCP and performs parsing, cleaning and tokenization steps
|
||||
to form a parallel corpus of (code, docstring) pairs with additional metadata. Processed data
|
||||
is saved as multi-part jsonl files to the OUTPUT_PATH.
|
||||
|
||||
Usage:
|
||||
parse_python_data.py [options] OUTPUT_PATH
|
||||
|
||||
Options:
|
||||
-h --help Show this screen.
|
||||
--input-folder=<path> Use the given input folder instead of downloading.
|
||||
--azure-info=<path> Azure authentication information file (JSON). Used to load data from Azure storage
|
||||
--debug Enable debug routines. [default: False]
|
||||
|
||||
"""
|
||||
import re
|
||||
import os
|
||||
from multiprocessing import Pool
|
||||
from typing import List, NamedTuple
|
||||
|
||||
import pandas as pd
|
||||
import parso
|
||||
from docopt import docopt
|
||||
from dpu_utils.utils import RichPath, run_and_debug
|
||||
from tqdm import tqdm
|
||||
|
||||
from dataextraction.utils import tokenize_docstring_from_string
|
||||
from utils.pkldf2jsonl import chunked_save_df_to_jsonl
|
||||
|
||||
|
||||
IS_WHITESPACE_REGEX = re.compile(r'\s+')
|
||||
|
||||
|
||||
class ParsedCode(NamedTuple):
|
||||
code_tokens: List[str]
|
||||
comment_tokens: List[str]
|
||||
|
||||
|
||||
def tokenize_python_from_string(code: str,
|
||||
func_only: bool=True,
|
||||
report_errors: bool=False,
|
||||
only_ids: bool=False,
|
||||
add_keywords: bool=True) -> ParsedCode:
|
||||
"""
|
||||
Tokenize Python code given a string.
|
||||
|
||||
Args:
|
||||
code: The input code
|
||||
func_only: if you want to only parse functions in code.
|
||||
report_errors: Flag that turns on verbose error reporting
|
||||
only_ids: Return only the identifiers within the code
|
||||
add_keywords: Return keywords (used only when only_ids=True)
|
||||
|
||||
Returns:
|
||||
Pair of lists. First list is sequence of code tokens; second list is sequence of tokens in comments.
|
||||
"""
|
||||
try:
|
||||
try:
|
||||
parsed_ast = parso.parse(code, error_recovery=False, version="2.7")
|
||||
except parso.parser.ParserSyntaxError:
|
||||
parsed_ast = parso.parse(code, error_recovery=False, version="3.7")
|
||||
code_tokens, comment_tokens = [], []
|
||||
|
||||
func_nodes = list(parsed_ast.iter_funcdefs())
|
||||
|
||||
# parse arbitrary snippets of code that are not functions if func_only = False
|
||||
if not func_only:
|
||||
func_nodes = [parsed_ast]
|
||||
|
||||
for func_node in func_nodes: # There should only be one, but we can process more...
|
||||
doc_node = func_node.get_doc_node()
|
||||
leaf_node = func_node.get_first_leaf()
|
||||
while True:
|
||||
# Skip over the docstring:
|
||||
if leaf_node is doc_node:
|
||||
leaf_node = leaf_node.get_next_leaf()
|
||||
|
||||
# First, retrieve comment tokens:
|
||||
for prefix in leaf_node._split_prefix():
|
||||
if prefix.type == 'comment':
|
||||
comment_text = prefix.value[1:] # Split off the leading "#"
|
||||
comment_tokens.extend(tokenize_docstring_from_string(comment_text))
|
||||
|
||||
# Second, stop if we've reached the end:
|
||||
if leaf_node.type == 'endmarker':
|
||||
break
|
||||
|
||||
# Third, record code tokens:
|
||||
if not(IS_WHITESPACE_REGEX.match(leaf_node.value)):
|
||||
if only_ids:
|
||||
if leaf_node.type == 'name':
|
||||
code_tokens.append(leaf_node.value)
|
||||
else:
|
||||
if leaf_node.type == 'keyword':
|
||||
if add_keywords:
|
||||
code_tokens.append(leaf_node.value)
|
||||
else:
|
||||
code_tokens.append(leaf_node.value)
|
||||
leaf_node = leaf_node.get_next_leaf()
|
||||
return ParsedCode(code_tokens=code_tokens, comment_tokens=comment_tokens)
|
||||
except Exception as e:
|
||||
if report_errors:
|
||||
print('Error tokenizing: %s' % (e,))
|
||||
return ParsedCode(code_tokens=[], comment_tokens=[])
|
||||
|
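# Illustrative only (not called anywhere in the pipeline): a tiny smoke check for the tokenizer above.
# The snippet is hypothetical; exact tokens depend on the parso version and the add_keywords/only_ids flags.
def _tokenizer_smoke_check() -> None:
    parsed = tokenize_python_from_string('def add(a, b):\n    # sum them\n    return a + b\n')
    print(parsed.code_tokens)     # e.g. ['def', 'add', '(', 'a', ',', 'b', ')', ':', 'return', 'a', '+', 'b']
    print(parsed.comment_tokens)  # tokens pulled from the inline comment, e.g. ['sum', 'them']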
||||
|
||||
def download_files_into_pandas(i: int=10) -> pd.DataFrame:
|
||||
"""Get files from Google Cloud Platform, there are 10 files.
|
||||
|
||||
Args:
|
||||
i : int between 1 and 10 that specifies how many of the 10 files you
|
||||
want to download. You should only use this argument for testing.
|
||||
|
||||
|
||||
Files are obtained by this query: https://console.cloud.google.com/bigquery?sq=235037502967:58a5d62f75f34d22b0f70d38b9352a85
|
||||
"""
|
||||
frames = []
|
||||
for i in tqdm(range(i), total=i):
|
||||
success = False
|
||||
while not success:
|
||||
try:
|
||||
frame = pd.read_csv(f'https://storage.googleapis.com/kubeflow-examples/code_search_new/python_raw_v2/00000000000{i}.csv', encoding='utf-8')
|
||||
frames.append(frame)
|
||||
success = True
|
||||
except Exception as e:
|
||||
print(f'Error downloading file {i}:\n {e}, retrying...')
|
||||
|
||||
df = pd.concat(frames)
|
||||
|
||||
df['repo'] = df['repo_path'].apply(lambda r: r.split()[0])
|
||||
df['path'] = df['repo_path'].apply(lambda r: r.split()[1])
|
||||
df.drop(columns=['repo_path'], inplace=True)
|
||||
df = df[['repo', 'path', 'content']]
|
||||
return df
|
||||
|
||||
|
||||
def load_files_into_pandas(input_folder: str) -> pd.DataFrame:
|
||||
"""Get files from a local directory.
|
||||
|
||||
Args:
|
||||
input_folder: the folder containing the .csv files
|
||||
"""
|
||||
frames = []
|
||||
for file in os.listdir(input_folder):
|
||||
if not file.endswith('.csv'):
|
||||
continue
|
||||
frame = pd.read_csv(os.path.join(input_folder, file), encoding='utf-8')
|
||||
frames.append(frame)
|
||||
|
||||
df = pd.concat(frames)
|
||||
|
||||
df['repo'] = df['repo_path'].apply(lambda r: r.split()[0])
|
||||
df['path'] = df['repo_path'].apply(lambda r: r.split()[1])
|
||||
df.drop(columns=['repo_path'], inplace=True)
|
||||
df = df[['repo', 'path', 'content']]
|
||||
return df
|
||||
|
||||
|
||||
def parse_raw_data_into_function_list(blob, require_docstring: bool=True):
|
||||
"""Extract per-function data from a given code blob.
|
||||
|
||||
Filters out undesirable function types. Keeps only the first line of the docstring, and removes all internal comments from
|
||||
the code.
|
||||
|
||||
Args:
|
||||
blob: String containing some python code.
|
||||
|
||||
Returns:
|
||||
List of functions represented by dictionaries containing the code, docstring and metadata.
|
||||
"""
|
||||
parsed_data_list = []
|
||||
try:
|
||||
try:
|
||||
parsed_module = parso.parse(blob, error_recovery=False, version="2.7")
|
||||
except parso.parser.ParserSyntaxError:
|
||||
parsed_module = parso.parse(blob, error_recovery=False, version="3.7")
|
||||
|
||||
function_defs = list(parsed_module.iter_funcdefs())
|
||||
for class_def in parsed_module.iter_classdefs():
|
||||
function_defs.extend(class_def.iter_funcdefs())
|
||||
|
||||
for function_def in function_defs:
|
||||
function_name = function_def.name.value
|
||||
docstring_node = function_def.get_doc_node()
|
||||
if docstring_node is None:
|
||||
docstring = ''
|
||||
else:
|
||||
docstring = docstring_node.value
|
||||
first_docstring_line = re.split(r'\n\s*\n', docstring)[0]
|
||||
|
||||
# We now need to un-indent the code which may have come from a class. For that, identify how far
|
||||
# we are indented, and try to remove that from all lines:
|
||||
function_code = function_def.get_code()
|
||||
def_prefix = list(function_def.get_first_leaf()._split_prefix())[-1].value
|
||||
trimmed_lines = []
|
||||
for line in function_code.splitlines():
|
||||
if line.startswith(def_prefix):
|
||||
trimmed_lines.append(line[len(def_prefix):])
|
||||
function_code = '\n'.join(trimmed_lines)
|
||||
|
||||
should_use_function = not (re.search(r'(__.+__)|(.*test.*)|(.*Test.*)', function_name) or # skip __*__ methods and test code
|
||||
re.search(r'NotImplementedException|@abstractmethod', function_code) or
|
||||
len(function_code.split('\n')) <= 2 or # should have more than 1 line of code (the declaration is one line)
|
||||
(len(first_docstring_line.split()) <= 2) and require_docstring) # docstring should have at least 3 words.
|
||||
|
||||
if should_use_function:
|
||||
parsed_data_list.append({'code': function_code,
|
||||
'docstring': first_docstring_line,
|
||||
'language': 'python',
|
||||
'lineno': function_def.start_pos[0],
|
||||
'func_name': function_name,
|
||||
})
|
||||
|
||||
except parso.parser.ParserSyntaxError:
|
||||
pass
|
||||
return parsed_data_list
|
||||
|
||||
|
||||
def listlen(x):
|
||||
if not isinstance(x, list):
|
||||
return 0
|
||||
return len(x)
|
||||
|
||||
|
||||
def run(args):
|
||||
azure_info_path = args.get('--azure-info')
|
||||
output_folder = RichPath.create(args['OUTPUT_PATH'], azure_info_path)
|
||||
|
||||
# Download / read the data files:
|
||||
if args['--input-folder'] is None:
|
||||
print('Downloading data...')
|
||||
raw_code_data_df = download_files_into_pandas()
|
||||
else:
|
||||
print('Loading data...')
|
||||
raw_code_data_df = load_files_into_pandas(args['--input-folder'])
|
||||
print('Data loaded.')
|
||||
|
||||
# Find all the functions and methods, filter out ones that don't meet requirements,
|
||||
# separate the code from the docstring and produce a list of functions that includes the code,
|
||||
# the first line of the docstring, and metadata of each:
|
||||
with Pool() as pool:
|
||||
function_data = pool.map(parse_raw_data_into_function_list, raw_code_data_df.content.tolist())
|
||||
assert len(function_data) == raw_code_data_df.shape[0], \
|
||||
f'Row count mismatch. `raw_code_data_df` has {raw_code_data_df.shape[0]} rows; `function_data` has {len(function_data)} rows.'
|
||||
raw_code_data_df['function_data'] = function_data
|
||||
print(f'Split {raw_code_data_df.shape[0]} blobs into {sum(len(fun_data) for fun_data in function_data)} documented individual functions.')
|
||||
|
||||
# Flatten function data out:
|
||||
# TODO: We should also have access to the SHA of the objects here.
|
||||
raw_code_data_df = raw_code_data_df.set_index(['repo', 'path'])['function_data'].apply(pd.Series).stack()
|
||||
raw_code_data_df = raw_code_data_df.reset_index()
|
||||
raw_code_data_df.columns = ['repo', 'path', '_', 'function_data']
|
||||
|
||||
# Extract meta-data and format dataframe.
|
||||
function_data_df = pd.DataFrame(raw_code_data_df.function_data.values.tolist())
|
||||
assert len(raw_code_data_df) == len(function_data_df), \
|
||||
f'Row count mismatch. `raw_code_data_df` has {len(raw_code_data_df)} rows; `function_data_df` has {len(function_data_df)} rows.'
|
||||
function_data_df = pd.concat([raw_code_data_df[['repo', 'path']], function_data_df], axis=1)
|
||||
|
||||
# remove observations where the same code appears more than once
|
||||
num_before_dedup = len(function_data_df)
|
||||
function_data_df = function_data_df.drop_duplicates(['code'])
|
||||
num_after_dedup = len(function_data_df)
|
||||
|
||||
print(f'Removed {num_before_dedup - num_after_dedup} exact duplicate rows.')
|
||||
|
||||
print('Tokenizing code, comments and docstrings ...')
|
||||
with Pool() as pool:
|
||||
code_tokenization_results: List[ParsedCode] = pool.map(tokenize_python_from_string,
|
||||
function_data_df['code'].tolist())
|
||||
|
||||
code_tokens_list, comment_tokens_list = list(zip(*code_tokenization_results))
|
||||
function_data_df['code_tokens'] = code_tokens_list
|
||||
function_data_df['comment_tokens'] = comment_tokens_list
|
||||
function_data_df['docstring_tokens'] = pool.map(tokenize_docstring_from_string,
|
||||
function_data_df['docstring'].tolist())
|
||||
function_data_df.dropna(subset=['code_tokens', 'comment_tokens', 'docstring_tokens'], inplace=True)
|
||||
function_data_df.reset_index(inplace=True, drop=True)
|
||||
|
||||
cols_to_keep = ['repo', 'path', 'lineno', 'func_name', 'language',
|
||||
'code', 'code_tokens', 'comment_tokens',
|
||||
'docstring', 'docstring_tokens',
|
||||
]
|
||||
# write data to jsonl
|
||||
print(f'Count by language:\n{function_data_df.language.value_counts()}')
|
||||
chunked_save_df_to_jsonl(df=function_data_df[cols_to_keep],
|
||||
output_folder=output_folder,
|
||||
parallel=True)
|
||||
print(f'Wrote {function_data_df.shape[0]} rows to {str(output_folder)}.')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
run_and_debug(lambda: run(args), args.get('--debug'))
|
|
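For reference, a minimal usage sketch of the extraction step above, assuming this script is importable as a module named `parse_python_data` (the module name is an assumption) and `parso` is installed:

from parse_python_data import parse_raw_data_into_function_list  # hypothetical module name

blob = (
    'def add(a, b):\n'
    '    """Add two numbers and return their sum."""\n'
    '    return a + b\n'
)

# Each returned entry carries the code, the first docstring line, and metadata.
for fn in parse_raw_data_into_function_list(blob, require_docstring=True):
    print(fn['func_name'], '-', fn['docstring'])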
@ -0,0 +1,8 @@
|
|||
import re
|
||||
from typing import List
|
||||
|
||||
DOCSTRING_REGEX_TOKENIZER = re.compile(r"[^\s,'\"`.():\[\]=*;>{\}+-/\\]+|\\+|\.+|\(\)|{\}|\[\]|\(+|\)+|:+|\[+|\]+|{+|\}+|=+|\*+|;+|>+|\++|-+|/+")
|
||||
|
||||
|
||||
def tokenize_docstring_from_string(docstr: str) -> List[str]:
|
||||
return [t for t in DOCSTRING_REGEX_TOKENIZER.findall(docstr) if t is not None and len(t) > 0]
|
|
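A quick illustration of the regex tokenizer above; the pattern is repeated inline so the sketch is self-contained, and the expected output is based on reading the pattern rather than on a verified run:

import re

# Mirrors DOCSTRING_REGEX_TOKENIZER above.
pattern = re.compile(r"[^\s,'\"`.():\[\]=*;>{\}+-/\\]+|\\+|\.+|\(\)|{\}|\[\]|\(+|\)+|:+|\[+|\]+|{+|\}+|=+|\*+|;+|>+|\++|-+|/+")

print([t for t in pattern.findall('Returns the sum of a and b.') if t])
# Expected output (roughly): ['Returns', 'the', 'sum', 'of', 'a', 'and', 'b', '.']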
@ -0,0 +1,19 @@
|
|||
# Third Party Notices and Information
|
||||
|
||||
Container images built with this project include third party materials; see below for license and other copyright information.
|
||||
|
||||
Certain open source code is available in container images, or online as noted below, or you may send a request for source code including identification of the container, the open source component name, and version number, to: `opensource@github.com`.
|
||||
|
||||
Notwithstanding any other terms, you may reverse engineer this software to the extent required to debug changes to any libraries licensed under the GNU Lesser General Public License for your own use.
|
||||
|
||||
## Ubuntu packages
|
||||
|
||||
License and other copyright information for each package is included in the image at `/usr/share/doc/{package}/copyright`.
|
||||
|
||||
Source for each package is available at `https://packages.ubuntu.com/source/{package}`.
|
||||
|
||||
## Python packages
|
||||
|
||||
License and other copyright information for each package is included in the image under `/usr/local/lib/python{version}/site-packages/{package}`.
|
||||
|
||||
Additional information for each package is available at `https://pypi.org/project/{package}`.
|
|
@ -0,0 +1,6 @@
|
|||
from .encoder import Encoder, QueryType
|
||||
from .nbow_seq_encoder import NBoWEncoder
|
||||
from .rnn_seq_encoder import RNNEncoder
|
||||
from .self_att_encoder import SelfAttentionEncoder
|
||||
from .conv_seq_encoder import ConvolutionSeqEncoder
|
||||
from .conv_self_att_encoder import ConvSelfAttentionEncoder
|
|
@ -0,0 +1,85 @@
|
|||
from typing import Dict, Any
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from .utils.bert_self_attention import BertConfig, BertModel
|
||||
from .masked_seq_encoder import MaskedSeqEncoder
|
||||
from utils.tfutils import get_activation, pool_sequence_embedding
|
||||
|
||||
|
||||
class ConvSelfAttentionEncoder(MaskedSeqEncoder):
|
||||
@classmethod
|
||||
def get_default_hyperparameters(cls) -> Dict[str, Any]:
|
||||
encoder_hypers = {'1dcnn_position_encoding': 'none', # One of {'none', 'learned'}
|
||||
'1dcnn_layer_list': [128, 128],
|
||||
'1dcnn_kernel_width': [8, 8], # Has to have same length as 1dcnn_layer_list
|
||||
'1dcnn_add_residual_connections': True,
|
||||
'1dcnn_activation': 'tanh',
|
||||
|
||||
'self_attention_activation': 'gelu',
|
||||
'self_attention_hidden_size': 128,
|
||||
'self_attention_intermediate_size': 512,
|
||||
'self_attention_num_layers': 2,
|
||||
'self_attention_num_heads': 8,
|
||||
'self_attention_pool_mode': 'weighted_mean',
|
||||
}
|
||||
hypers = super().get_default_hyperparameters()
|
||||
hypers.update(encoder_hypers)
|
||||
return hypers
|
||||
|
||||
def __init__(self, label: str, hyperparameters: Dict[str, Any], metadata: Dict[str, Any]):
|
||||
super().__init__(label, hyperparameters, metadata)
|
||||
|
||||
@property
|
||||
def output_representation_size(self):
|
||||
return self.get_hyper('self_attention_hidden_size')
|
||||
|
||||
def make_model(self, is_train: bool = False) -> tf.Tensor:
|
||||
with tf.variable_scope("self_attention_encoder"):
|
||||
self._make_placeholders()
|
||||
|
||||
seq_tokens_embeddings = self.embedding_layer(self.placeholders['tokens'])
|
||||
|
||||
activation_fun = get_activation(self.get_hyper('1dcnn_activation'))
|
||||
current_embeddings = seq_tokens_embeddings
|
||||
num_filters_and_width = zip(self.get_hyper('1dcnn_layer_list'), self.get_hyper('1dcnn_kernel_width'))
|
||||
for (layer_idx, (num_filters, kernel_width)) in enumerate(num_filters_and_width):
|
||||
next_embeddings = tf.layers.conv1d(
|
||||
inputs=current_embeddings,
|
||||
filters=num_filters,
|
||||
kernel_size=kernel_width,
|
||||
padding="same")
|
||||
|
||||
# Add residual connections past the first layer.
|
||||
if self.get_hyper('1dcnn_add_residual_connections') and layer_idx > 0:
|
||||
next_embeddings += current_embeddings
|
||||
|
||||
current_embeddings = activation_fun(next_embeddings)
|
||||
|
||||
current_embeddings = tf.nn.dropout(current_embeddings,
|
||||
keep_prob=self.placeholders['dropout_keep_rate'])
|
||||
|
||||
config = BertConfig(vocab_size=self.get_hyper('token_vocab_size'),
|
||||
hidden_size=self.get_hyper('self_attention_hidden_size'),
|
||||
num_hidden_layers=self.get_hyper('self_attention_num_layers'),
|
||||
num_attention_heads=self.get_hyper('self_attention_num_heads'),
|
||||
intermediate_size=self.get_hyper('self_attention_intermediate_size'))
|
||||
|
||||
model = BertModel(config=config,
|
||||
is_training=is_train,
|
||||
input_ids=self.placeholders['tokens'],
|
||||
input_mask=self.placeholders['tokens_mask'],
|
||||
use_one_hot_embeddings=False,
|
||||
embedded_input=current_embeddings)
|
||||
|
||||
output_pool_mode = self.get_hyper('self_attention_pool_mode').lower()
|
||||
if output_pool_mode == 'bert':
|
||||
return model.get_pooled_output()
|
||||
else:
|
||||
seq_token_embeddings = model.get_sequence_output()
|
||||
seq_token_masks = self.placeholders['tokens_mask']
|
||||
seq_token_lengths = tf.reduce_sum(seq_token_masks, axis=1) # B
|
||||
return pool_sequence_embedding(output_pool_mode,
|
||||
sequence_token_embeddings=seq_token_embeddings,
|
||||
sequence_lengths=seq_token_lengths,
|
||||
sequence_token_masks=seq_token_masks)
|
|
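The encoders in this repo all build their defaults the same way: declare encoder-specific hyperparameters and merge them over the parent class's defaults via `hypers.update(...)`. A stripped-down sketch of that merge order (the class names and values here are illustrative, not from the repo):

from typing import Any, Dict

class Base:
    @classmethod
    def get_default_hyperparameters(cls) -> Dict[str, Any]:
        return {'dropout_keep_rate': 0.9}

class Child(Base):
    @classmethod
    def get_default_hyperparameters(cls) -> Dict[str, Any]:
        hypers = super().get_default_hyperparameters()
        hypers.update({'self_attention_hidden_size': 128})  # child-specific values win on clashes
        return hypers

print(Child.get_default_hyperparameters())
# {'dropout_keep_rate': 0.9, 'self_attention_hidden_size': 128}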
@ -0,0 +1,77 @@
|
|||
from typing import Dict, Any
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from .masked_seq_encoder import MaskedSeqEncoder
|
||||
from utils.tfutils import get_activation, pool_sequence_embedding
|
||||
|
||||
|
||||
class ConvolutionSeqEncoder(MaskedSeqEncoder):
|
||||
@classmethod
|
||||
def get_default_hyperparameters(cls) -> Dict[str, Any]:
|
||||
encoder_hypers = {'1dcnn_position_encoding': 'learned', # One of {'none', 'learned'}
|
||||
'1dcnn_layer_list': [128, 128, 128],
|
||||
'1dcnn_kernel_width': [16, 16, 16], # Has to have same length as 1dcnn_layer_list
|
||||
'1dcnn_add_residual_connections': True,
|
||||
'1dcnn_activation': 'tanh',
|
||||
'1dcnn_pool_mode': 'weighted_mean',
|
||||
}
|
||||
hypers = super().get_default_hyperparameters()
|
||||
hypers.update(encoder_hypers)
|
||||
return hypers
|
||||
|
||||
def __init__(self, label: str, hyperparameters: Dict[str, Any], metadata: Dict[str, Any]):
|
||||
super().__init__(label, hyperparameters, metadata)
|
||||
|
||||
@property
|
||||
def output_representation_size(self):
|
||||
return self.get_hyper('1dcnn_layer_list')[-1]
|
||||
|
||||
def make_model(self, is_train: bool=False) -> tf.Tensor:
|
||||
with tf.variable_scope("1dcnn_encoder"):
|
||||
self._make_placeholders()
|
||||
|
||||
seq_tokens_embeddings = self.embedding_layer(self.placeholders['tokens'])
|
||||
seq_tokens_embeddings = self.__add_position_encoding(seq_tokens_embeddings)
|
||||
|
||||
activation_fun = get_activation(self.get_hyper('1dcnn_activation'))
|
||||
current_embeddings = seq_tokens_embeddings
|
||||
num_filters_and_width = zip(self.get_hyper('1dcnn_layer_list'), self.get_hyper('1dcnn_kernel_width'))
|
||||
for (layer_idx, (num_filters, kernel_width)) in enumerate(num_filters_and_width):
|
||||
next_embeddings = tf.layers.conv1d(
|
||||
inputs=current_embeddings,
|
||||
filters=num_filters,
|
||||
kernel_size=kernel_width,
|
||||
padding="same")
|
||||
|
||||
# Add residual connections past the first layer.
|
||||
if self.get_hyper('1dcnn_add_residual_connections') and layer_idx > 0:
|
||||
next_embeddings += current_embeddings
|
||||
|
||||
current_embeddings = activation_fun(next_embeddings)
|
||||
|
||||
current_embeddings = tf.nn.dropout(current_embeddings,
|
||||
keep_prob=self.placeholders['dropout_keep_rate'])
|
||||
|
||||
seq_token_mask = self.placeholders['tokens_mask']
|
||||
seq_token_lengths = tf.reduce_sum(seq_token_mask, axis=1) # B
|
||||
return pool_sequence_embedding(self.get_hyper('1dcnn_pool_mode').lower(),
|
||||
sequence_token_embeddings=current_embeddings,
|
||||
sequence_lengths=seq_token_lengths,
|
||||
sequence_token_masks=seq_token_mask)
|
||||
|
||||
def __add_position_encoding(self, seq_inputs: tf.Tensor) -> tf.Tensor:
|
||||
position_encoding = self.get_hyper('1dcnn_position_encoding').lower()
|
||||
if position_encoding == 'none':
|
||||
return seq_inputs
|
||||
elif position_encoding == 'learned':
|
||||
position_embeddings = \
|
||||
tf.get_variable(name='position_embeddings',
|
||||
initializer=tf.truncated_normal_initializer(stddev=0.02),
|
||||
shape=[self.get_hyper('max_num_tokens'),
|
||||
self.get_hyper('token_embedding_size')],
|
||||
)
|
||||
# Add batch dimension to position embeddings to make broadcasting work, then add:
|
||||
return seq_inputs + tf.expand_dims(position_embeddings, axis=0)
|
||||
else:
|
||||
raise ValueError("Unknown position encoding '%s'!" % position_encoding)
|
|
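The 'learned' position encoding above stores one embedding per position and relies on broadcasting it across the batch dimension. A minimal NumPy sketch of the same broadcast (the shapes are illustrative assumptions):

import numpy as np

batch, seq_len, dim = 4, 200, 128                 # illustrative sizes
token_embeddings = np.random.randn(batch, seq_len, dim)
position_table = np.random.randn(seq_len, dim)    # one vector per position, learned in the real model

# Adding a leading batch axis lets broadcasting add the same table to every sequence in the batch.
with_positions = token_embeddings + position_table[np.newaxis, :, :]
assert with_positions.shape == (batch, seq_len, dim)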
@ -0,0 +1,208 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from enum import Enum
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class QueryType(Enum):
|
||||
DOCSTRING = 'docstring_as_query'
|
||||
FUNCTION_NAME = 'func_name_as_query'
|
||||
|
||||
|
||||
class Encoder(ABC):
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def get_default_hyperparameters(cls) -> Dict[str, Any]:
|
||||
"""
|
||||
Returns:
|
||||
Default set of hyperparameters for encoder.
|
||||
Note that in use, the hyperparameter names will be prefixed with '${label}_' for the
|
||||
chosen encoder label.
|
||||
"""
|
||||
return {}
|
||||
|
||||
def __init__(self, label: str, hyperparameters: Dict[str, Any], metadata: Dict[str, Any]):
|
||||
"""
|
||||
Args:
|
||||
label: Label for the encoder, used in names of hyperparameters.
|
||||
hyperparameters: Hyperparameters used.
|
||||
metadata: Dictionary with metadata (e.g., vocabularies) used by this encoder.
|
||||
"""
|
||||
self.__label = label
|
||||
self.__hyperparameters = hyperparameters
|
||||
self.__metadata = metadata
|
||||
self.__placeholders = {}
|
||||
|
||||
@property
|
||||
def label(self):
|
||||
return self.__label
|
||||
|
||||
@property
|
||||
def hyperparameters(self):
|
||||
return self.__hyperparameters
|
||||
|
||||
@property
|
||||
def metadata(self):
|
||||
return self.__metadata
|
||||
|
||||
@property
|
||||
def placeholders(self):
|
||||
return self.__placeholders
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def output_representation_size(self) -> int:
|
||||
raise Exception("Encoder.output_representation_size not implemented!")
|
||||
|
||||
def get_hyper(self, hyper_name: str) -> Any:
|
||||
"""
|
||||
Retrieve a hyperparameter, prefixing the given name with the label of the encoder.
|
||||
|
||||
Args:
|
||||
hyper_name: Some hyperparameter name.
|
||||
|
||||
Returns:
|
||||
self.hyperparameters['%s_%s' % (self.label, hyper_name)]
|
||||
"""
|
||||
return self.hyperparameters['%s_%s' % (self.label, hyper_name)]
|
||||
|
||||
def _make_placeholders(self):
|
||||
"""
|
||||
Creates placeholders for encoders.
|
||||
"""
|
||||
self.__placeholders['dropout_keep_rate'] = \
|
||||
tf.placeholder(tf.float32,
|
||||
shape=(),
|
||||
name='dropout_keep_rate')
|
||||
|
||||
@abstractmethod
|
||||
def make_model(self, is_train: bool=False) -> tf.Tensor:
|
||||
"""
|
||||
Create the actual encoder model, including necessary placeholders and parameters.
|
||||
|
||||
Args:
|
||||
is_train: Bool flag indicating if the model is used for training or inference.
|
||||
|
||||
Returns:
|
||||
A tensor encoding the passed data.
|
||||
"""
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def init_metadata(cls) -> Dict[str, Any]:
|
||||
"""
|
||||
Called to initialise the metadata before looking at actual data (i.e., set up Counters, lists, sets, ...)
|
||||
|
||||
Returns:
|
||||
A dictionary that will be used to collect the raw metadata (token counts, ...).
|
||||
"""
|
||||
return {}
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def load_metadata_from_sample(cls, data_to_load: Any, raw_metadata: Dict[str, Any],
|
||||
use_subtokens: bool=False, mark_subtoken_end: bool=False) -> None:
|
||||
"""
|
||||
Called to load metadata from a single sample.
|
||||
|
||||
Args:
|
||||
data_to_load: Raw data to load; type depends on encoder. Usually comes from a data parser such as
|
||||
tokenize_python_from_string or tokenize_docstring_from_string.
|
||||
raw_metadata: A dictionary that will be used to collect the raw metadata (token counts, ...).
|
||||
use_subtokens: subtokenize identifiers
|
||||
mark_subtoken_end: add a special marker for subtoken ends. Used only if use_subtokens=True
|
||||
"""
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any], raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
"""
|
||||
Called to finalise the metadata after looking at actual data (i.e., compute vocabularies, ...)
|
||||
|
||||
Args:
|
||||
encoder_label: Label used for this encoder.
|
||||
hyperparameters: Hyperparameters used.
|
||||
raw_metadata_list: List of dictionaries used to collect the raw metadata (token counts, ...) (one per file).
|
||||
|
||||
Returns:
|
||||
Finalised metadata (vocabs, ...).
|
||||
"""
|
||||
return {}
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def load_data_from_sample(cls,
|
||||
encoder_label: str,
|
||||
hyperparameters: Dict[str, Any],
|
||||
metadata: Dict[str, Any],
|
||||
data_to_load: Any,
|
||||
function_name: Optional[str],
|
||||
result_holder: Dict[str, Any],
|
||||
is_test: bool=True) -> bool:
|
||||
"""
|
||||
Called to convert a raw sample into the internal format, allowing for preprocessing.
|
||||
Result will eventually be fed again into the split_data_into_minibatches pipeline.
|
||||
|
||||
Args:
|
||||
encoder_label: Label used for this encoder.
|
||||
hyperparameters: Hyperparameters used to load data.
|
||||
metadata: Computed metadata (e.g. vocabularies).
|
||||
data_to_load: Raw data to load; type depends on encoder. Usually comes from a data parser such as
|
||||
tokenize_python_from_string or tokenize_docstring_from_string.
|
||||
function_name: The name of the function.
|
||||
result_holder: Dictionary used to hold the prepared data.
|
||||
is_test: Flag marking if we are handling test data (rather than training data).
|
||||
|
||||
Returns:
|
||||
Flag indicating if the example should be used (True) or dropped (False)
|
||||
"""
|
||||
return True
|
||||
|
||||
@abstractmethod
|
||||
def init_minibatch(self, batch_data: Dict[str, Any]) -> None:
|
||||
"""
|
||||
Initialise a minibatch that will be constructed.
|
||||
|
||||
Args:
|
||||
batch_data: The minibatch data.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def extend_minibatch_by_sample(self, batch_data: Dict[str, Any], sample: Dict[str, Any], is_train: bool=False,
|
||||
query_type: QueryType=QueryType.DOCSTRING.value) -> bool:
|
||||
"""
|
||||
Extend a minibatch under construction by one sample. This is where the data may be randomly perturbed in each
|
||||
epoch for data augmentation.
|
||||
|
||||
Args:
|
||||
batch_data: The minibatch data.
|
||||
sample: The sample to add.
|
||||
is_train: Flag indicating if we are in train mode (which causes data augmentation)
|
||||
query_type: Indicates what should be used as the query, the docstring or the function name.
|
||||
|
||||
Returns:
|
||||
True iff the minibatch is full after this sample.
|
||||
"""
|
||||
return True
|
||||
|
||||
@abstractmethod
|
||||
def minibatch_to_feed_dict(self, batch_data: Dict[str, Any], feed_dict: Dict[tf.Tensor, Any], is_train: bool) -> None:
|
||||
"""
|
||||
Take a collected minibatch and add it to a feed dict that can be fed directly to the constructed model.
|
||||
|
||||
Args:
|
||||
batch_data: The minibatch data.
|
||||
feed_dict: The feed dictionary that we will send to tensorflow.
|
||||
is_train: Flag indicating if we are in training mode.
|
||||
"""
|
||||
feed_dict[self.placeholders['dropout_keep_rate']] = self.hyperparameters['dropout_keep_rate'] if is_train else 1.0
|
||||
|
||||
@abstractmethod
|
||||
def get_token_embeddings(self) -> Tuple[tf.Tensor, List[str]]:
|
||||
"""Returns the tensorflow embeddings tensor (VxD) along with a list (of size V) of the names of the
|
||||
embedded elements."""
|
||||
pass
|
|
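As the `Encoder` docstrings above note, hyperparameters are stored with a `'{label}_'` prefix and `get_hyper` hides that prefixing from callers. A tiny sketch of the lookup convention (the dictionary contents are made up for illustration):

# Hypothetical hyperparameter dictionary for an encoder labelled 'code':
hyperparameters = {'code_token_embedding_size': 128, 'code_max_num_tokens': 200}

label = 'code'
hyper_name = 'max_num_tokens'
# Equivalent to calling get_hyper('max_num_tokens') on a 'code'-labelled encoder:
value = hyperparameters['%s_%s' % (label, hyper_name)]
print(value)  # 200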
@ -0,0 +1,40 @@
|
|||
from typing import Dict, Any, Iterable, Optional
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
from .seq_encoder import SeqEncoder
|
||||
from utils.tfutils import write_to_feed_dict, pool_sequence_embedding
|
||||
|
||||
|
||||
class MaskedSeqEncoder(SeqEncoder):
|
||||
@classmethod
|
||||
def get_default_hyperparameters(cls) -> Dict[str, Any]:
|
||||
encoder_hypers = {
|
||||
}
|
||||
hypers = super().get_default_hyperparameters()
|
||||
hypers.update(encoder_hypers)
|
||||
return hypers
|
||||
|
||||
def __init__(self, label: str, hyperparameters: Dict[str, Any], metadata: Dict[str, Any]):
|
||||
super().__init__(label, hyperparameters, metadata)
|
||||
|
||||
def _make_placeholders(self):
|
||||
"""
|
||||
Creates placeholders "tokens" and "tokens_mask" for masked sequence encoders.
|
||||
"""
|
||||
super()._make_placeholders()
|
||||
self.placeholders['tokens_mask'] = \
|
||||
tf.placeholder(tf.float32,
|
||||
shape=[None, self.get_hyper('max_num_tokens')],
|
||||
name='tokens_mask')
|
||||
|
||||
def init_minibatch(self, batch_data: Dict[str, Any]) -> None:
|
||||
super().init_minibatch(batch_data)
|
||||
batch_data['tokens'] = []
|
||||
batch_data['tokens_mask'] = []
|
||||
|
||||
def minibatch_to_feed_dict(self, batch_data: Dict[str, Any], feed_dict: Dict[tf.Tensor, Any], is_train: bool) -> None:
|
||||
super().minibatch_to_feed_dict(batch_data, feed_dict, is_train)
|
||||
write_to_feed_dict(feed_dict, self.placeholders['tokens'], batch_data['tokens'])
|
||||
write_to_feed_dict(feed_dict, self.placeholders['tokens_mask'], batch_data['tokens_mask'])
|
|
@ -0,0 +1,35 @@
|
|||
from typing import Dict, Any
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from .masked_seq_encoder import MaskedSeqEncoder
|
||||
from utils.tfutils import pool_sequence_embedding
|
||||
|
||||
|
||||
class NBoWEncoder(MaskedSeqEncoder):
|
||||
@classmethod
|
||||
def get_default_hyperparameters(cls) -> Dict[str, Any]:
|
||||
encoder_hypers = { 'nbow_pool_mode': 'weighted_mean',
|
||||
}
|
||||
hypers = super().get_default_hyperparameters()
|
||||
hypers.update(encoder_hypers)
|
||||
return hypers
|
||||
|
||||
def __init__(self, label: str, hyperparameters: Dict[str, Any], metadata: Dict[str, Any]):
|
||||
super().__init__(label, hyperparameters, metadata)
|
||||
|
||||
@property
|
||||
def output_representation_size(self):
|
||||
return self.get_hyper('token_embedding_size')
|
||||
|
||||
def make_model(self, is_train: bool=False) -> tf.Tensor:
|
||||
with tf.variable_scope("nbow_encoder"):
|
||||
self._make_placeholders()
|
||||
|
||||
seq_tokens_embeddings = self.embedding_layer(self.placeholders['tokens'])
|
||||
seq_token_mask = self.placeholders['tokens_mask']
|
||||
seq_token_lengths = tf.reduce_sum(seq_token_mask, axis=1) # B
|
||||
return pool_sequence_embedding(self.get_hyper('nbow_pool_mode').lower(),
|
||||
sequence_token_embeddings=seq_tokens_embeddings,
|
||||
sequence_lengths=seq_token_lengths,
|
||||
sequence_token_masks=seq_token_mask)
|
|
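The NBoW encoder reduces to pooling token embeddings under a padding mask. A NumPy sketch of a plain masked mean over the sequence axis (the actual 'weighted_mean' mode in utils.tfutils may weight tokens differently; this only shows the general idea):

import numpy as np

embeddings = np.random.randn(2, 5, 8)            # (batch, seq_len, dim)
mask = np.array([[1, 1, 1, 0, 0],
                 [1, 1, 1, 1, 1]], dtype=float)  # 1 = real token, 0 = padding

masked_sum = (embeddings * mask[:, :, None]).sum(axis=1)
lengths = mask.sum(axis=1, keepdims=True)
pooled = masked_sum / np.maximum(lengths, 1.0)   # (batch, dim), padding ignored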
@ -0,0 +1,197 @@
|
|||
from typing import Dict, Any, Union, Tuple
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from .seq_encoder import SeqEncoder
|
||||
from utils.tfutils import write_to_feed_dict, pool_sequence_embedding
|
||||
|
||||
|
||||
def __make_rnn_cell(cell_type: str,
|
||||
hidden_size: int,
|
||||
dropout_keep_rate: Union[float, tf.Tensor]=1.0,
|
||||
recurrent_dropout_keep_rate: Union[float, tf.Tensor]=1.0) \
|
||||
-> tf.nn.rnn_cell.RNNCell:
|
||||
"""
|
||||
Args:
|
||||
cell_type: "lstm", "gru", or 'rnn' (any casing)
|
||||
hidden_size: size for the underlying recurrent unit
|
||||
dropout_keep_rate: keep probability for output vectors
|
||||
recurrent_dropout_keep_rate: keep probability for recurrent state vectors
|
||||
|
||||
Returns:
|
||||
RNNCell of the desired type.
|
||||
"""
|
||||
cell_type = cell_type.lower()
|
||||
if cell_type == 'lstm':
|
||||
cell = tf.nn.rnn_cell.LSTMCell(hidden_size)
|
||||
elif cell_type == 'gru':
|
||||
cell = tf.nn.rnn_cell.GRUCell(hidden_size)
|
||||
elif cell_type == 'rnn':
|
||||
cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
|
||||
else:
|
||||
raise ValueError("Unknown RNN cell type '%s'!" % cell_type)
|
||||
|
||||
return tf.contrib.rnn.DropoutWrapper(cell,
|
||||
output_keep_prob=dropout_keep_rate,
|
||||
state_keep_prob=recurrent_dropout_keep_rate)
|
||||
|
||||
|
||||
def _make_deep_rnn_cell(num_layers: int,
|
||||
cell_type: str,
|
||||
hidden_size: int,
|
||||
dropout_keep_rate: Union[float, tf.Tensor]=1.0,
|
||||
recurrent_dropout_keep_rate: Union[float, tf.Tensor]=1.0) \
|
||||
-> tf.nn.rnn_cell.RNNCell:
|
||||
"""
|
||||
Args:
|
||||
num_layers: number of layers in result
|
||||
cell_type: "lstm" or "gru" (any casing)
|
||||
hidden_size: size for the underlying recurrent unit
|
||||
dropout_keep_rate: keep probability for output vectors
|
||||
recurrent_dropout_keep_rate: keep probability for recurrent state vectors
|
||||
|
||||
Returns:
|
||||
(Multi)RNNCell of the desired type.
|
||||
"""
|
||||
if num_layers == 1:
|
||||
return __make_rnn_cell(cell_type, hidden_size, dropout_keep_rate, recurrent_dropout_keep_rate)
|
||||
else:
|
||||
cells = [__make_rnn_cell(cell_type, hidden_size, dropout_keep_rate, recurrent_dropout_keep_rate)
|
||||
for _ in range(num_layers)]
|
||||
return tf.nn.rnn_cell.MultiRNNCell(cells)
|
||||
|
||||
|
||||
class RNNEncoder(SeqEncoder):
|
||||
@classmethod
|
||||
def get_default_hyperparameters(cls) -> Dict[str, Any]:
|
||||
encoder_hypers = {'rnn_num_layers': 2,
|
||||
'rnn_hidden_dim': 64,
|
||||
'rnn_cell_type': 'LSTM', # One of [LSTM, GRU, RNN]
|
||||
'rnn_is_bidirectional': True,
|
||||
'rnn_dropout_keep_rate': 0.8,
|
||||
'rnn_recurrent_dropout_keep_rate': 1.0,
|
||||
'rnn_pool_mode': 'weighted_mean',
|
||||
}
|
||||
hypers = super().get_default_hyperparameters()
|
||||
hypers.update(encoder_hypers)
|
||||
return hypers
|
||||
|
||||
def __init__(self, label: str, hyperparameters: Dict[str, Any], metadata: Dict[str, Any]):
|
||||
super().__init__(label, hyperparameters, metadata)
|
||||
|
||||
@property
|
||||
def output_representation_size(self):
|
||||
if self.get_hyper('rnn_is_bidirectional'):
|
||||
return 2 * self.get_hyper('rnn_hidden_dim')
|
||||
else:
|
||||
return self.get_hyper('rnn_hidden_dim')
|
||||
|
||||
def _encode_with_rnn(self,
|
||||
inputs: tf.Tensor,
|
||||
input_lengths: tf.Tensor) \
|
||||
-> Tuple[tf.Tensor, tf.Tensor]:
|
||||
cell_type = self.get_hyper('rnn_cell_type').lower()
|
||||
rnn_cell_fwd = _make_deep_rnn_cell(num_layers=self.get_hyper('rnn_num_layers'),
|
||||
cell_type=cell_type,
|
||||
hidden_size=self.get_hyper('rnn_hidden_dim'),
|
||||
dropout_keep_rate=self.placeholders['rnn_dropout_keep_rate'],
|
||||
recurrent_dropout_keep_rate=self.placeholders['rnn_recurrent_dropout_keep_rate'],
|
||||
)
|
||||
if not self.get_hyper('rnn_is_bidirectional'):
|
||||
(outputs, final_states) = tf.nn.dynamic_rnn(cell=rnn_cell_fwd,
|
||||
inputs=inputs,
|
||||
sequence_length=input_lengths,
|
||||
dtype=tf.float32,
|
||||
)
|
||||
|
||||
if cell_type == 'lstm':
|
||||
final_state = tf.concat([tf.concat(layer_final_state, axis=-1) # concat c & m of LSTM cell
|
||||
for layer_final_state in final_states],
|
||||
axis=-1) # concat across layers
|
||||
elif cell_type == 'gru' or cell_type == 'rnn':
|
||||
final_state = tf.concat(final_states, axis=-1)
|
||||
else:
|
||||
raise ValueError("Unknown RNN cell type '%s'!" % cell_type)
|
||||
else:
|
||||
rnn_cell_bwd = _make_deep_rnn_cell(num_layers=self.get_hyper('rnn_num_layers'),
|
||||
cell_type=cell_type,
|
||||
hidden_size=self.get_hyper('rnn_hidden_dim'),
|
||||
dropout_keep_rate=self.placeholders['rnn_dropout_keep_rate'],
|
||||
recurrent_dropout_keep_rate=self.placeholders['rnn_recurrent_dropout_keep_rate'],
|
||||
)
|
||||
|
||||
(outputs, final_states) = tf.nn.bidirectional_dynamic_rnn(cell_fw=rnn_cell_fwd,
|
||||
cell_bw=rnn_cell_bwd,
|
||||
inputs=inputs,
|
||||
sequence_length=input_lengths,
|
||||
dtype=tf.float32,
|
||||
)
|
||||
# Merge fwd/bwd outputs:
|
||||
if cell_type == 'lstm':
|
||||
final_state = tf.concat([tf.concat([tf.concat(layer_final_state, axis=-1) # concat c & m of LSTM cell
|
||||
for layer_final_state in layer_final_states],
|
||||
axis=-1) # concat across layers
|
||||
for layer_final_states in final_states],
|
||||
axis=-1) # concat fwd & bwd
|
||||
elif cell_type == 'gru' or cell_type == 'rnn':
|
||||
final_state = tf.concat([tf.concat(layer_final_states, axis=-1) # concat across layers
|
||||
for layer_final_states in final_states],
|
||||
axis=-1) # concat fwd & bwd
|
||||
else:
|
||||
raise ValueError("Unknown RNN cell type '%s'!" % cell_type)
|
||||
outputs = tf.concat(outputs, axis=-1) # concat fwd & bwd
|
||||
|
||||
return final_state, outputs
|
||||
|
||||
def make_model(self, is_train: bool=False) -> tf.Tensor:
|
||||
with tf.variable_scope("rnn_encoder"):
|
||||
self._make_placeholders()
|
||||
|
||||
self.placeholders['tokens_lengths'] = \
|
||||
tf.placeholder(tf.int32,
|
||||
shape=[None],
|
||||
name='tokens_lengths')
|
||||
|
||||
self.placeholders['rnn_dropout_keep_rate'] = \
|
||||
tf.placeholder(tf.float32,
|
||||
shape=[],
|
||||
name='rnn_dropout_keep_rate')
|
||||
|
||||
self.placeholders['rnn_recurrent_dropout_keep_rate'] = \
|
||||
tf.placeholder(tf.float32,
|
||||
shape=[],
|
||||
name='rnn_recurrent_dropout_keep_rate')
|
||||
|
||||
seq_tokens = self.placeholders['tokens']
|
||||
seq_tokens_embeddings = self.embedding_layer(seq_tokens)
|
||||
seq_tokens_lengths = self.placeholders['tokens_lengths']
|
||||
|
||||
rnn_final_state, token_embeddings = self._encode_with_rnn(seq_tokens_embeddings, seq_tokens_lengths)
|
||||
|
||||
output_pool_mode = self.get_hyper('rnn_pool_mode').lower()
|
||||
if output_pool_mode == 'rnn_final':
|
||||
return rnn_final_state
|
||||
else:
|
||||
token_mask = tf.expand_dims(tf.range(tf.shape(seq_tokens)[1]), axis=0) # 1 x T
|
||||
token_mask = tf.tile(token_mask, multiples=(tf.shape(seq_tokens_lengths)[0], 1)) # B x T
|
||||
token_mask = tf.cast(token_mask < tf.expand_dims(seq_tokens_lengths, axis=-1),
|
||||
dtype=tf.float32) # B x T
|
||||
return pool_sequence_embedding(output_pool_mode,
|
||||
sequence_token_embeddings=token_embeddings,
|
||||
sequence_lengths=seq_tokens_lengths,
|
||||
sequence_token_masks=token_mask)
|
||||
|
||||
def init_minibatch(self, batch_data: Dict[str, Any]) -> None:
|
||||
super().init_minibatch(batch_data)
|
||||
batch_data['tokens'] = []
|
||||
batch_data['tokens_lengths'] = []
|
||||
|
||||
def minibatch_to_feed_dict(self, batch_data: Dict[str, Any], feed_dict: Dict[tf.Tensor, Any], is_train: bool) -> None:
|
||||
super().minibatch_to_feed_dict(batch_data, feed_dict, is_train)
|
||||
feed_dict[self.placeholders['rnn_dropout_keep_rate']] = \
|
||||
self.get_hyper('rnn_dropout_keep_rate') if is_train else 1.0
|
||||
feed_dict[self.placeholders['rnn_recurrent_dropout_keep_rate']] = \
|
||||
self.get_hyper('rnn_recurrent_dropout_keep_rate') if is_train else 1.0
|
||||
|
||||
write_to_feed_dict(feed_dict, self.placeholders['tokens'], batch_data['tokens'])
|
||||
write_to_feed_dict(feed_dict, self.placeholders['tokens_lengths'], batch_data['tokens_lengths'])
|
|
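Unlike the masked encoders, the RNN encoder rebuilds its pooling mask from `tokens_lengths` with a range-and-compare trick. The same computation in NumPy (the sizes are illustrative):

import numpy as np

seq_len = 6
lengths = np.array([3, 6, 1])                              # per-example number of real tokens

positions = np.arange(seq_len)[None, :]                    # 1 x T
mask = (positions < lengths[:, None]).astype(np.float32)   # B x T, 1.0 up to each length
# [[1. 1. 1. 0. 0. 0.]
#  [1. 1. 1. 1. 1. 1.]
#  [1. 0. 0. 0. 0. 0.]]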
@ -0,0 +1,57 @@
|
|||
from typing import Dict, Any
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from .utils.bert_self_attention import BertConfig, BertModel
|
||||
from .masked_seq_encoder import MaskedSeqEncoder
|
||||
from utils.tfutils import pool_sequence_embedding
|
||||
|
||||
|
||||
class SelfAttentionEncoder(MaskedSeqEncoder):
|
||||
@classmethod
|
||||
def get_default_hyperparameters(cls) -> Dict[str, Any]:
|
||||
encoder_hypers = {'self_attention_activation': 'gelu',
|
||||
'self_attention_hidden_size': 128,
|
||||
'self_attention_intermediate_size': 512,
|
||||
'self_attention_num_layers': 3,
|
||||
'self_attention_num_heads': 8,
|
||||
'self_attention_pool_mode': 'weighted_mean',
|
||||
}
|
||||
hypers = super().get_default_hyperparameters()
|
||||
hypers.update(encoder_hypers)
|
||||
return hypers
|
||||
|
||||
def __init__(self, label: str, hyperparameters: Dict[str, Any], metadata: Dict[str, Any]):
|
||||
super().__init__(label, hyperparameters, metadata)
|
||||
|
||||
@property
|
||||
def output_representation_size(self):
|
||||
return self.get_hyper('self_attention_hidden_size')
|
||||
|
||||
def make_model(self, is_train: bool = False) -> tf.Tensor:
|
||||
with tf.variable_scope("self_attention_encoder"):
|
||||
self._make_placeholders()
|
||||
|
||||
config = BertConfig(vocab_size=self.get_hyper('token_vocab_size'),
|
||||
hidden_size=self.get_hyper('self_attention_hidden_size'),
|
||||
num_hidden_layers=self.get_hyper('self_attention_num_layers'),
|
||||
num_attention_heads=self.get_hyper('self_attention_num_heads'),
|
||||
intermediate_size=self.get_hyper('self_attention_intermediate_size'))
|
||||
|
||||
model = BertModel(config=config,
|
||||
is_training=is_train,
|
||||
input_ids=self.placeholders['tokens'],
|
||||
input_mask=self.placeholders['tokens_mask'],
|
||||
use_one_hot_embeddings=False)
|
||||
|
||||
output_pool_mode = self.get_hyper('self_attention_pool_mode').lower()
|
||||
if output_pool_mode == 'bert':
|
||||
return model.get_pooled_output()
|
||||
else:
|
||||
seq_token_embeddings = model.get_sequence_output()
|
||||
seq_token_masks = self.placeholders['tokens_mask']
|
||||
seq_token_lengths = tf.reduce_sum(seq_token_masks, axis=1) # B
|
||||
return pool_sequence_embedding(output_pool_mode,
|
||||
sequence_token_embeddings=seq_token_embeddings,
|
||||
sequence_lengths=seq_token_lengths,
|
||||
sequence_token_masks=seq_token_masks)
|
|
@ -0,0 +1,225 @@
|
|||
from collections import Counter
|
||||
import numpy as np
|
||||
from typing import Dict, Any, List, Iterable, Optional, Tuple
|
||||
import random
|
||||
import re
|
||||
|
||||
from utils.bpevocabulary import BpeVocabulary
|
||||
from utils.tfutils import convert_and_pad_token_sequence
|
||||
|
||||
import tensorflow as tf
|
||||
from dpu_utils.codeutils import split_identifier_into_parts
|
||||
from dpu_utils.mlutils import Vocabulary
|
||||
|
||||
from .encoder import Encoder, QueryType
|
||||
|
||||
|
||||
class SeqEncoder(Encoder):
|
||||
@classmethod
|
||||
def get_default_hyperparameters(cls) -> Dict[str, Any]:
|
||||
encoder_hypers = { 'token_vocab_size': 10000,
|
||||
'token_vocab_count_threshold': 10,
|
||||
'token_embedding_size': 128,
|
||||
|
||||
'use_subtokens': False,
|
||||
'mark_subtoken_end': False,
|
||||
|
||||
'max_num_tokens': 200,
|
||||
|
||||
'use_bpe': True,
|
||||
'pct_bpe': 0.5
|
||||
}
|
||||
hypers = super().get_default_hyperparameters()
|
||||
hypers.update(encoder_hypers)
|
||||
return hypers
|
||||
|
||||
IDENTIFIER_TOKEN_REGEX = re.compile('[_a-zA-Z][_a-zA-Z0-9]*')
|
||||
|
||||
def __init__(self, label: str, hyperparameters: Dict[str, Any], metadata: Dict[str, Any]):
|
||||
super().__init__(label, hyperparameters, metadata)
|
||||
if hyperparameters['%s_use_bpe' % label]:
|
||||
assert not hyperparameters['%s_use_subtokens' % label], 'Subtokens cannot be used along with BPE.'
|
||||
elif hyperparameters['%s_use_subtokens' % label]:
|
||||
assert not hyperparameters['%s_use_bpe' % label], 'Subtokens cannot be used along with BPE.'
|
||||
|
||||
def _make_placeholders(self):
|
||||
"""
|
||||
Creates placeholders "tokens" for sequence encoders.
|
||||
"""
|
||||
super()._make_placeholders()
|
||||
self.placeholders['tokens'] = \
|
||||
tf.placeholder(tf.int32,
|
||||
shape=[None, self.get_hyper('max_num_tokens')],
|
||||
name='tokens')
|
||||
|
||||
def embedding_layer(self, token_inp: tf.Tensor) -> tf.Tensor:
|
||||
"""
|
||||
Creates an embedding layer that is shared by many encoders.
|
||||
|
||||
Args:
|
||||
token_inp: 2D tensor that is of shape (batch size, sequence length)
|
||||
|
||||
Returns:
|
||||
3D tensor of shape (batch size, sequence length, embedding dimension)
|
||||
"""
|
||||
|
||||
token_embeddings = tf.get_variable(name='token_embeddings',
|
||||
initializer=tf.glorot_uniform_initializer(),
|
||||
shape=[len(self.metadata['token_vocab']),
|
||||
self.get_hyper('token_embedding_size')],
|
||||
)
|
||||
self.__embeddings = token_embeddings
|
||||
|
||||
token_embeddings = tf.nn.dropout(token_embeddings,
|
||||
keep_prob=self.placeholders['dropout_keep_rate'])
|
||||
|
||||
return tf.nn.embedding_lookup(params=token_embeddings, ids=token_inp)
|
||||
|
||||
@classmethod
|
||||
def init_metadata(cls) -> Dict[str, Any]:
|
||||
raw_metadata = super().init_metadata()
|
||||
raw_metadata['token_counter'] = Counter()
|
||||
return raw_metadata
|
||||
|
||||
@classmethod
|
||||
def _to_subtoken_stream(cls, input_stream: Iterable[str], mark_subtoken_end: bool) -> Iterable[str]:
|
||||
for token in input_stream:
|
||||
if SeqEncoder.IDENTIFIER_TOKEN_REGEX.match(token):
|
||||
yield from split_identifier_into_parts(token)
|
||||
if mark_subtoken_end:
|
||||
yield '</id>'
|
||||
else:
|
||||
yield token
|
||||
|
||||
@classmethod
|
||||
def load_metadata_from_sample(cls, data_to_load: Iterable[str], raw_metadata: Dict[str, Any],
|
||||
use_subtokens: bool=False, mark_subtoken_end: bool=False) -> None:
|
||||
if use_subtokens:
|
||||
data_to_load = cls._to_subtoken_stream(data_to_load, mark_subtoken_end=mark_subtoken_end)
|
||||
raw_metadata['token_counter'].update(data_to_load)
|
||||
|
||||
@classmethod
|
||||
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any], raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
|
||||
merged_token_counter = Counter()
|
||||
for raw_metadata in raw_metadata_list:
|
||||
merged_token_counter += raw_metadata['token_counter']
|
||||
|
||||
if hyperparameters['%s_use_bpe' % encoder_label]:
|
||||
token_vocabulary = BpeVocabulary(vocab_size=hyperparameters['%s_token_vocab_size' % encoder_label],
|
||||
pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label]
|
||||
)
|
||||
token_vocabulary.fit(merged_token_counter)
|
||||
else:
|
||||
token_vocabulary = Vocabulary.create_vocabulary(tokens=merged_token_counter,
|
||||
max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
|
||||
count_threshold=hyperparameters['%s_token_vocab_count_threshold' % encoder_label])
|
||||
|
||||
final_metadata['token_vocab'] = token_vocabulary
|
||||
# Save the most common tokens for use in data augmentation:
|
||||
final_metadata['common_tokens'] = merged_token_counter.most_common(50)
|
||||
return final_metadata
|
||||
|
||||
@classmethod
|
||||
def load_data_from_sample(cls,
|
||||
encoder_label: str,
|
||||
hyperparameters: Dict[str, Any],
|
||||
metadata: Dict[str, Any],
|
||||
data_to_load: Any,
|
||||
function_name: Optional[str],
|
||||
result_holder: Dict[str, Any],
|
||||
is_test: bool = True) -> bool:
|
||||
"""
|
||||
Saves two versions of both the code and the query: one using the docstring as the query and the other using the
|
||||
function-name as the query, and replacing the function name in the code with an out-of-vocab token.
|
||||
Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
|
||||
"""
|
||||
# Save the two versions of the code and query:
|
||||
data_holder = {QueryType.DOCSTRING.value: data_to_load, QueryType.FUNCTION_NAME.value: None}
|
||||
# Skip samples where the function name is very short, because it probably has too little information
|
||||
# to be a good search query.
|
||||
if not is_test and hyperparameters['fraction_using_func_name'] > 0. and function_name and \
|
||||
len(function_name) >= hyperparameters['min_len_func_name_for_query']:
|
||||
if encoder_label == 'query':
|
||||
# Set the query tokens to the function name, broken up into its sub-tokens:
|
||||
data_holder[QueryType.FUNCTION_NAME.value] = split_identifier_into_parts(function_name)
|
||||
elif encoder_label == 'code':
|
||||
# In the code, replace the function name with the out-of-vocab token everywhere it appears:
|
||||
data_holder[QueryType.FUNCTION_NAME.value] = [Vocabulary.get_unk() if token == function_name else token
|
||||
for token in data_to_load]
|
||||
|
||||
# Sub-tokenize, convert, and pad both versions:
|
||||
for key, data in data_holder.items():
|
||||
if not data:
|
||||
result_holder[f'{encoder_label}_tokens_{key}'] = None
|
||||
result_holder[f'{encoder_label}_tokens_mask_{key}'] = None
|
||||
result_holder[f'{encoder_label}_tokens_length_{key}'] = None
|
||||
continue
|
||||
if hyperparameters[f'{encoder_label}_use_subtokens']:
|
||||
data = cls._to_subtoken_stream(data,
|
||||
mark_subtoken_end=hyperparameters[
|
||||
f'{encoder_label}_mark_subtoken_end'])
|
||||
tokens, tokens_mask = \
|
||||
convert_and_pad_token_sequence(metadata['token_vocab'], list(data),
|
||||
hyperparameters[f'{encoder_label}_max_num_tokens'])
|
||||
# Note that we share the result_holder with different encoders, and so we need to make our identifiers
|
||||
# unique-ish
|
||||
result_holder[f'{encoder_label}_tokens_{key}'] = tokens
|
||||
result_holder[f'{encoder_label}_tokens_mask_{key}'] = tokens_mask
|
||||
result_holder[f'{encoder_label}_tokens_length_{key}'] = int(np.sum(tokens_mask))
|
||||
|
||||
if result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'] is None or \
|
||||
int(np.sum(result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'])) == 0:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def extend_minibatch_by_sample(self, batch_data: Dict[str, Any], sample: Dict[str, Any], is_train: bool=False,
|
||||
query_type: QueryType = QueryType.DOCSTRING.value) -> bool:
|
||||
"""
|
||||
Implements various forms of data augmentation.
|
||||
"""
|
||||
current_sample = dict()
|
||||
|
||||
# Train with some fraction of samples having their query set to the function name instead of the docstring, and
|
||||
# their function name replaced with out-of-vocab in the code:
|
||||
current_sample['tokens'] = sample[f'{self.label}_tokens_{query_type}']
|
||||
current_sample['tokens_mask'] = sample[f'{self.label}_tokens_mask_{query_type}']
|
||||
current_sample['tokens_lengths'] = sample[f'{self.label}_tokens_length_{query_type}']
|
||||
|
||||
# In the query, randomly add high-frequency tokens:
|
||||
# TODO: Add tokens with frequency proportional to their frequency in the vocabulary
|
||||
if is_train and self.label == 'query' and self.hyperparameters['query_random_token_frequency'] > 0.:
|
||||
total_length = len(current_sample['tokens'])
|
||||
length_without_padding = current_sample['tokens_lengths']
|
||||
# Generate a list of places in which to insert tokens:
|
||||
insert_indices = np.array([random.uniform(0., 1.) for _ in range(length_without_padding)]) # don't allow insertions in the padding
|
||||
insert_indices = insert_indices < self.hyperparameters['query_random_token_frequency'] # insert at the correct frequency
|
||||
insert_indices = np.flatnonzero(insert_indices)
|
||||
if len(insert_indices) > 0:
|
||||
# Generate the random tokens to add:
|
||||
tokens_to_add = [random.randrange(0, len(self.metadata['common_tokens']))
|
||||
for _ in range(len(insert_indices))] # select one of the most common tokens for each location
|
||||
tokens_to_add = [self.metadata['common_tokens'][token][0] for token in tokens_to_add] # get the word corresponding to the token we're adding
|
||||
tokens_to_add = [self.metadata['token_vocab'].get_id_or_unk(token) for token in tokens_to_add] # get the index within the vocab of the token we're adding
|
||||
# Efficiently insert the added tokens, leaving the total length the same:
|
||||
to_insert = 0
|
||||
output_query = np.zeros(total_length, dtype=int)
|
||||
for idx in range(min(length_without_padding, total_length - len(insert_indices))): # iterate only through the beginning of the array where changes are being made
|
||||
if to_insert < len(insert_indices) and idx == insert_indices[to_insert]:
|
||||
output_query[idx + to_insert] = tokens_to_add[to_insert]
|
||||
to_insert += 1
|
||||
output_query[idx + to_insert] = current_sample['tokens'][idx]
|
||||
current_sample['tokens'] = output_query
|
||||
# Add the needed number of non-padding values to the mask:
|
||||
current_sample['tokens_mask'][length_without_padding:length_without_padding + len(tokens_to_add)] = 1.
|
||||
current_sample['tokens_lengths'] += len(tokens_to_add)
|
||||
|
||||
# Add the current sample to the minibatch:
|
||||
[batch_data[key].append(current_sample[key]) for key in current_sample.keys() if key in batch_data.keys()]
|
||||
|
||||
return False
|
||||
|
||||
def get_token_embeddings(self) -> Tuple[tf.Tensor, List[str]]:
|
||||
return (self.__embeddings,
|
||||
list(self.metadata['token_vocab'].id_to_token))
|
|
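A small sketch of the function-name-as-query substitution described in `load_data_from_sample` above: when the function name is used as the query, every occurrence of that name in the code tokens is replaced by the out-of-vocabulary token ('%UNK%' below is only an illustrative placeholder; the real pipeline uses Vocabulary.get_unk() from dpu_utils):

code_tokens = ['def', 'add', '(', 'a', 'b', ')', 'return', 'a', '+', 'b']
function_name = 'add'
UNK = '%UNK%'  # placeholder; the real pipeline uses Vocabulary.get_unk()

# Code side: hide the function name so the model cannot trivially match query and code.
masked_code = [UNK if tok == function_name else tok for tok in code_tokens]
# Query side: the function name split into sub-tokens (['add'] for a single-word name).
query_tokens = ['add']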
@ -0,0 +1,951 @@
|
|||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""The main BERT model and related functions."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import copy
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
import six
|
||||
import tensorflow as tf
|
||||
|
||||
from utils.tfutils import get_activation
|
||||
|
||||
|
||||
class BertConfig(object):
|
||||
"""Configuration for `BertModel`."""
|
||||
|
||||
def __init__(self,
|
||||
vocab_size,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=512,
|
||||
type_vocab_size=16,
|
||||
initializer_range=0.02):
|
||||
"""Constructs BertConfig.
|
||||
|
||||
Args:
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
|
||||
hidden_size: Size of the encoder layers and the pooler layer.
|
||||
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
||||
layer in the Transformer encoder.
|
||||
hidden_act: The non-linear activation function (function or string) in the
|
||||
encoder and pooler.
|
||||
hidden_dropout_prob: The dropout probability for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob: The dropout ratio for the attention
|
||||
probabilities.
|
||||
max_position_embeddings: The maximum sequence length that this model might
|
||||
ever be used with. Typically set this to something large just in case
|
||||
(e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
|
||||
`BertModel`.
|
||||
initializer_range: The stdev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
"""
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.intermediate_size = intermediate_size
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.type_vocab_size = type_vocab_size
|
||||
self.initializer_range = initializer_range
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, json_object):
|
||||
"""Constructs a `BertConfig` from a Python dictionary of parameters."""
|
||||
config = BertConfig(vocab_size=None)
|
||||
for (key, value) in six.iteritems(json_object):
|
||||
config.__dict__[key] = value
|
||||
return config
|
||||
|
||||
@classmethod
|
||||
def from_json_file(cls, json_file):
|
||||
"""Constructs a `BertConfig` from a json file of parameters."""
|
||||
with tf.gfile.GFile(json_file, "r") as reader:
|
||||
text = reader.read()
|
||||
return cls.from_dict(json.loads(text))
|
||||
|
||||
def to_dict(self):
|
||||
"""Serializes this instance to a Python dictionary."""
|
||||
output = copy.deepcopy(self.__dict__)
|
||||
return output
|
||||
|
||||
def to_json_string(self):
|
||||
"""Serializes this instance to a JSON string."""
|
||||
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
||||
|
||||
|
||||
class BertModel(object):
|
||||
"""BERT model ("Bidirectional Embedding Representations from a Transformer").
|
||||
|
||||
Example usage:
|
||||
|
||||
```python
|
||||
# Already been converted into WordPiece token ids
|
||||
input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
|
||||
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
|
||||
token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])
|
||||
|
||||
config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
|
||||
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
|
||||
|
||||
model = modeling.BertModel(config=config, is_training=True,
|
||||
input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)
|
||||
|
||||
label_embeddings = tf.get_variable(...)
|
||||
pooled_output = model.get_pooled_output()
|
||||
logits = tf.matmul(pooled_output, label_embeddings)
|
||||
...
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
config,
|
||||
is_training,
|
||||
input_ids,
|
||||
input_mask=None,
|
||||
token_type_ids=None,
|
||||
use_one_hot_embeddings=True,
|
||||
scope=None,
|
||||
embedded_input=None):
|
||||
"""Constructor for BertModel.
|
||||
|
||||
Args:
|
||||
config: `BertConfig` instance.
|
||||
is_training: bool. True for training model, false for eval model. Controls
|
||||
whether dropout will be applied.
|
||||
input_ids: int32 Tensor of shape [batch_size, seq_length].
|
||||
input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
|
||||
token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
|
||||
use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
|
||||
embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
|
||||
it is much faster if this is True; on the CPU or GPU, it is faster if
|
||||
this is False.
|
||||
scope: (optional) variable scope. Defaults to "bert".
|
||||
embedded_input: (optional) If provided, the embedding layer here is
|
||||
skipped and the passed embeddings are passed into the self-attentional
|
||||
layers.
|
||||
|
||||
Raises:
|
||||
ValueError: The config is invalid or one of the input tensor shapes
|
||||
is invalid.
|
||||
"""
|
||||
config = copy.deepcopy(config)
|
||||
if not is_training:
|
||||
config.hidden_dropout_prob = 0.0
|
||||
config.attention_probs_dropout_prob = 0.0
|
||||
|
||||
input_shape = get_shape_list(input_ids, expected_rank=2)
|
||||
batch_size = input_shape[0]
|
||||
seq_length = input_shape[1]
|
||||
|
||||
if input_mask is None:
|
||||
input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
|
||||
|
||||
if token_type_ids is None:
|
||||
token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)
|
||||
|
||||
with tf.variable_scope("bert", scope):
|
||||
with tf.variable_scope("embeddings"):
|
||||
if embedded_input is None:
|
||||
# Perform embedding lookup on the word ids.
|
||||
(self.embedding_output, self.embedding_table) = embedding_lookup(
|
||||
input_ids=input_ids,
|
||||
vocab_size=config.vocab_size,
|
||||
embedding_size=config.hidden_size,
|
||||
initializer_range=config.initializer_range,
|
||||
word_embedding_name="word_embeddings",
|
||||
use_one_hot_embeddings=use_one_hot_embeddings)
|
||||
else:
|
||||
self.embedding_output = embedded_input
|
||||
|
||||
# Add positional embeddings and token type embeddings, then layer
|
||||
# normalize and perform dropout.
|
||||
self.embedding_output = embedding_postprocessor(
|
||||
input_tensor=self.embedding_output,
|
||||
use_token_type=True,
|
||||
token_type_ids=token_type_ids,
|
||||
token_type_vocab_size=config.type_vocab_size,
|
||||
token_type_embedding_name="token_type_embeddings",
|
||||
use_position_embeddings=True,
|
||||
position_embedding_name="position_embeddings",
|
||||
initializer_range=config.initializer_range,
|
||||
max_position_embeddings=config.max_position_embeddings,
|
||||
dropout_prob=config.hidden_dropout_prob)
|
||||
|
||||
with tf.variable_scope("encoder"):
|
||||
# This converts a 2D mask of shape [batch_size, seq_length] to a 3D
|
||||
# mask of shape [batch_size, seq_length, seq_length] which is used
|
||||
# for the attention scores.
|
||||
attention_mask = create_attention_mask_from_input_mask(
|
||||
input_ids, input_mask)
|
||||
|
||||
# Run the stacked transformer.
|
||||
# `sequence_output` shape = [batch_size, seq_length, hidden_size].
|
||||
self.all_encoder_layers = transformer_model(
|
||||
input_tensor=self.embedding_output,
|
||||
attention_mask=attention_mask,
|
||||
hidden_size=config.hidden_size,
|
||||
num_hidden_layers=config.num_hidden_layers,
|
||||
num_attention_heads=config.num_attention_heads,
|
||||
intermediate_size=config.intermediate_size,
|
||||
intermediate_act_fn=get_activation(config.hidden_act),
|
||||
hidden_dropout_prob=config.hidden_dropout_prob,
|
||||
attention_probs_dropout_prob=config.attention_probs_dropout_prob,
|
||||
initializer_range=config.initializer_range,
|
||||
do_return_all_layers=True)
|
||||
|
||||
self.sequence_output = self.all_encoder_layers[-1]
|
||||
# The "pooler" converts the encoded sequence tensor of shape
|
||||
# [batch_size, seq_length, hidden_size] to a tensor of shape
|
||||
# [batch_size, hidden_size]. This is necessary for segment-level
|
||||
# (or segment-pair-level) classification tasks where we need a fixed
|
||||
# dimensional representation of the segment.
|
||||
with tf.variable_scope("pooler"):
|
||||
# We "pool" the model by simply taking the hidden state corresponding
|
||||
# to the first token. We assume that this has been pre-trained
|
||||
first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
|
||||
self.pooled_output = tf.layers.dense(
|
||||
first_token_tensor,
|
||||
config.hidden_size,
|
||||
activation=tf.tanh,
|
||||
kernel_initializer=create_initializer(config.initializer_range))
|
||||
|
||||
def get_pooled_output(self):
|
||||
return self.pooled_output
|
||||
|
||||
def get_sequence_output(self):
|
||||
"""Gets final hidden layer of encoder.
|
||||
|
||||
Returns:
|
||||
float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
|
||||
to the final hidden state of the transformer encoder.
|
||||
"""
|
||||
return self.sequence_output
|
||||
|
||||
def get_all_encoder_layers(self):
|
||||
return self.all_encoder_layers
|
||||
|
||||
def get_embedding_output(self):
|
||||
"""Gets output of the embedding lookup (i.e., input to the transformer).
|
||||
|
||||
Returns:
|
||||
float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
|
||||
to the output of the embedding layer, after summing the word
|
||||
embeddings with the positional embeddings and the token type embeddings,
|
||||
then performing layer normalization. This is the input to the transformer.
|
||||
"""
|
||||
return self.embedding_output
|
||||
|
||||
def get_embedding_table(self):
|
||||
return self.embedding_table
|
||||
|
||||
|
||||
def get_assigment_map_from_checkpoint(tvars, init_checkpoint):
|
||||
"""Compute the union of the current variables and checkpoint variables."""
|
||||
assignment_map = {}
|
||||
initialized_variable_names = {}
|
||||
|
||||
name_to_variable = collections.OrderedDict()
|
||||
for var in tvars:
|
||||
name = var.name
|
||||
m = re.match("^(.*):\\d+$", name)
|
||||
if m is not None:
|
||||
name = m.group(1)
|
||||
name_to_variable[name] = var
|
||||
|
||||
init_vars = tf.train.list_variables(init_checkpoint)
|
||||
|
||||
assignment_map = collections.OrderedDict()
|
||||
for x in init_vars:
|
||||
(name, var) = (x[0], x[1])
|
||||
if name not in name_to_variable:
|
||||
continue
|
||||
assignment_map[name] = name
|
||||
initialized_variable_names[name] = 1
|
||||
initialized_variable_names[name + ":0"] = 1
|
||||
|
||||
return (assignment_map, initialized_variable_names)
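

def _example_warm_start_from_checkpoint(init_checkpoint):
  """Minimal usage sketch for the assignment map above, assuming a TF 1.x setup.

  `init_checkpoint` is assumed to point at an existing BERT checkpoint; the
  helper name is illustrative and not part of the original file.
  """
  tvars = tf.trainable_variables()
  assignment_map, initialized_variable_names = get_assigment_map_from_checkpoint(
      tvars, init_checkpoint)
  # Override the initializers of the mapped variables so that they are restored
  # from the checkpoint when the session initializes them.
  tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
  return initialized_variable_names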
|
||||
|
||||
|
||||
def dropout(input_tensor, dropout_prob):
|
||||
"""Perform dropout.
|
||||
|
||||
Args:
|
||||
input_tensor: float Tensor.
|
||||
dropout_prob: Python float. The probability of dropping out a value (NOT of
|
||||
*keeping* a dimension as in `tf.nn.dropout`).
|
||||
|
||||
Returns:
|
||||
A version of `input_tensor` with dropout applied.
|
||||
"""
|
||||
if dropout_prob is None or dropout_prob == 0.0:
|
||||
return input_tensor
|
||||
|
||||
output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
|
||||
return output
|
||||
|
||||
|
||||
def layer_norm(input_tensor, name=None):
|
||||
"""Run layer normalization on the last dimension of the tensor."""
|
||||
return tf.contrib.layers.layer_norm(
|
||||
inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
|
||||
|
||||
|
||||
def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
|
||||
"""Runs layer normalization followed by dropout."""
|
||||
output_tensor = layer_norm(input_tensor, name)
|
||||
output_tensor = dropout(output_tensor, dropout_prob)
|
||||
return output_tensor
|
||||
|
||||
|
||||
def create_initializer(initializer_range=0.02):
|
||||
"""Creates a `truncated_normal_initializer` with the given range."""
|
||||
return tf.truncated_normal_initializer(stddev=initializer_range)
|
||||
|
||||
|
||||
def embedding_lookup(input_ids,
|
||||
vocab_size,
|
||||
embedding_size=128,
|
||||
initializer_range=0.02,
|
||||
word_embedding_name="word_embeddings",
|
||||
use_one_hot_embeddings=False):
|
||||
"""Looks up words embeddings for id tensor.
|
||||
|
||||
Args:
|
||||
input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
|
||||
ids.
|
||||
vocab_size: int. Size of the embedding vocabulary.
|
||||
embedding_size: int. Width of the word embeddings.
|
||||
initializer_range: float. Embedding initialization range.
|
||||
word_embedding_name: string. Name of the embedding table.
|
||||
use_one_hot_embeddings: bool. If True, use one-hot method for word
|
||||
embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
|
||||
for TPUs.
|
||||
|
||||
Returns:
|
||||
float Tensor of shape [batch_size, seq_length, embedding_size].
|
||||
"""
|
||||
# This function assumes that the input is of shape [batch_size, seq_length,
|
||||
# num_inputs].
|
||||
#
|
||||
# If the input is a 2D tensor of shape [batch_size, seq_length], we
|
||||
# reshape to [batch_size, seq_length, 1].
|
||||
if input_ids.shape.ndims == 2:
|
||||
input_ids = tf.expand_dims(input_ids, axis=[-1])
|
||||
|
||||
embedding_table = tf.get_variable(
|
||||
name=word_embedding_name,
|
||||
shape=[vocab_size, embedding_size],
|
||||
initializer=create_initializer(initializer_range))
|
||||
|
||||
if use_one_hot_embeddings:
|
||||
flat_input_ids = tf.reshape(input_ids, [-1])
|
||||
one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
|
||||
output = tf.matmul(one_hot_input_ids, embedding_table)
|
||||
else:
|
||||
output = tf.nn.embedding_lookup(embedding_table, input_ids)
|
||||
|
||||
input_shape = get_shape_list(input_ids)
|
||||
|
||||
output = tf.reshape(output,
|
||||
input_shape[0:-1] + [input_shape[-1] * embedding_size])
|
||||
return (output, embedding_table)
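

def _example_embedding_lookup_shapes():
  """Minimal shape sketch for `embedding_lookup`; all sizes are arbitrary assumptions.

  For the toy [2, 4] id batch below, the call returns a [2, 4, 8] output tensor
  and a [10, 8] embedding table.
  """
  input_ids = tf.constant([[1, 2, 3, 0], [4, 5, 0, 0]], dtype=tf.int32)  # [2, 4]
  output, embedding_table = embedding_lookup(
      input_ids, vocab_size=10, embedding_size=8)
  return output, embedding_table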
|
||||
|
||||
|
||||
def embedding_postprocessor(input_tensor,
|
||||
use_token_type=False,
|
||||
token_type_ids=None,
|
||||
token_type_vocab_size=16,
|
||||
token_type_embedding_name="token_type_embeddings",
|
||||
use_position_embeddings=True,
|
||||
position_embedding_name="position_embeddings",
|
||||
initializer_range=0.02,
|
||||
max_position_embeddings=512,
|
||||
dropout_prob=0.1):
|
||||
"""Performs various post-processing on a word embedding tensor.
|
||||
|
||||
Args:
|
||||
input_tensor: float Tensor of shape [batch_size, seq_length,
|
||||
embedding_size].
|
||||
use_token_type: bool. Whether to add embeddings for `token_type_ids`.
|
||||
token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
|
||||
Must be specified if `use_token_type` is True.
|
||||
token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
|
||||
token_type_embedding_name: string. The name of the embedding table variable
|
||||
for token type ids.
|
||||
use_position_embeddings: bool. Whether to add position embeddings for the
|
||||
position of each token in the sequence.
|
||||
position_embedding_name: string. The name of the embedding table variable
|
||||
for positional embeddings.
|
||||
initializer_range: float. Range of the weight initialization.
|
||||
max_position_embeddings: int. Maximum sequence length that might ever be
|
||||
used with this model. This can be longer than the sequence length of
|
||||
input_tensor, but cannot be shorter.
|
||||
dropout_prob: float. Dropout probability applied to the final output tensor.
|
||||
|
||||
Returns:
|
||||
float tensor with same shape as `input_tensor`.
|
||||
|
||||
Raises:
|
||||
ValueError: One of the tensor shapes or input values is invalid.
|
||||
"""
|
||||
input_shape = get_shape_list(input_tensor, expected_rank=3)
|
||||
batch_size = input_shape[0]
|
||||
seq_length = input_shape[1]
|
||||
width = input_shape[2]
|
||||
|
||||
if seq_length > max_position_embeddings:
|
||||
raise ValueError("The seq length (%d) cannot be greater than "
|
||||
"`max_position_embeddings` (%d)" %
|
||||
(seq_length, max_position_embeddings))
|
||||
|
||||
output = input_tensor
|
||||
|
||||
if use_token_type:
|
||||
if token_type_ids is None:
|
||||
raise ValueError("`token_type_ids` must be specified if"
|
||||
"`use_token_type` is True.")
|
||||
token_type_table = tf.get_variable(
|
||||
name=token_type_embedding_name,
|
||||
shape=[token_type_vocab_size, width],
|
||||
initializer=create_initializer(initializer_range))
|
||||
# This vocab will be small so we always do one-hot here, since it is always
|
||||
# faster for a small vocabulary.
|
||||
flat_token_type_ids = tf.reshape(token_type_ids, [-1])
|
||||
one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
|
||||
token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
|
||||
token_type_embeddings = tf.reshape(token_type_embeddings,
|
||||
[batch_size, seq_length, width])
|
||||
output += token_type_embeddings
|
||||
|
||||
if use_position_embeddings:
|
||||
full_position_embeddings = tf.get_variable(
|
||||
name=position_embedding_name,
|
||||
shape=[max_position_embeddings, width],
|
||||
initializer=create_initializer(initializer_range))
|
||||
# Since the position embedding table is a learned variable, we create it
|
||||
# using a (long) sequence length `max_position_embeddings`. The actual
|
||||
# sequence length might be shorter than this, for faster training of
|
||||
# tasks that do not have long sequences.
|
||||
#
|
||||
# So `full_position_embeddings` is effectively an embedding table
|
||||
# for position [0, 1, 2, ..., max_position_embeddings-1], and the current
|
||||
# sequence has positions [0, 1, 2, ... seq_length-1], so we can just
|
||||
# perform a slice.
|
||||
if seq_length < max_position_embeddings:
|
||||
position_embeddings = tf.slice(full_position_embeddings, [0, 0],
|
||||
[seq_length, -1])
|
||||
else:
|
||||
position_embeddings = full_position_embeddings
|
||||
|
||||
num_dims = len(output.shape.as_list())
|
||||
|
||||
# Only the last two dimensions are relevant (`seq_length` and `width`), so
|
||||
# we broadcast among the first dimensions, which is typically just
|
||||
# the batch size.
|
||||
position_broadcast_shape = []
|
||||
for _ in range(num_dims - 2):
|
||||
position_broadcast_shape.append(1)
|
||||
position_broadcast_shape.extend([seq_length, width])
|
||||
position_embeddings = tf.reshape(position_embeddings,
|
||||
position_broadcast_shape)
|
||||
output += position_embeddings
|
||||
|
||||
output = layer_norm_and_dropout(output, dropout_prob)
|
||||
return output
|
||||
|
||||
|
||||
def create_attention_mask_from_input_mask(from_tensor, to_mask):
|
||||
"""Create 3D attention mask from a 2D tensor mask.
|
||||
|
||||
Args:
|
||||
from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
|
||||
to_mask: int32 Tensor of shape [batch_size, to_seq_length].
|
||||
|
||||
Returns:
|
||||
float Tensor of shape [batch_size, from_seq_length, to_seq_length].
|
||||
"""
|
||||
from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
|
||||
batch_size = from_shape[0]
|
||||
from_seq_length = from_shape[1]
|
||||
|
||||
to_shape = get_shape_list(to_mask, expected_rank=2)
|
||||
to_seq_length = to_shape[1]
|
||||
|
||||
to_mask = tf.cast(
|
||||
tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
|
||||
|
||||
# We don't assume that `from_tensor` is a mask (although it could be). We
|
||||
# don't actually care if we attend *from* padding tokens (only *to* padding)
|
||||
# tokens so we create a tensor of all ones.
|
||||
#
|
||||
# `broadcast_ones` = [batch_size, from_seq_length, 1]
|
||||
broadcast_ones = tf.ones(
|
||||
shape=[batch_size, from_seq_length, 1], dtype=tf.float32)
|
||||
|
||||
# Here we broadcast along two dimensions to create the mask.
|
||||
mask = broadcast_ones * to_mask
|
||||
|
||||
return mask
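

def _example_attention_mask_broadcasting():
  """NumPy sketch of the broadcasting used above; the toy mask is an assumption.

  A [batch, to_seq_length] padding mask is expanded to
  [batch, from_seq_length, to_seq_length] by multiplying a column of ones with
  the row mask, so every query position sees the same key-side padding pattern.
  """
  import numpy as np
  to_mask = np.array([[1, 1, 1, 0]], dtype=np.float32)      # [1, 4]
  broadcast_ones = np.ones((1, 4, 1), dtype=np.float32)     # [1, from_seq, 1]
  mask = broadcast_ones * to_mask.reshape(1, 1, 4)          # [1, 4, 4]
  return mask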
|
||||
|
||||
|
||||
def attention_layer(from_tensor,
|
||||
to_tensor,
|
||||
attention_mask=None,
|
||||
num_attention_heads=1,
|
||||
size_per_head=512,
|
||||
query_act=None,
|
||||
key_act=None,
|
||||
value_act=None,
|
||||
attention_probs_dropout_prob=0.0,
|
||||
initializer_range=0.02,
|
||||
do_return_2d_tensor=False,
|
||||
batch_size=None,
|
||||
from_seq_length=None,
|
||||
to_seq_length=None):
|
||||
"""Performs multi-headed attention from `from_tensor` to `to_tensor`.
|
||||
|
||||
This is an implementation of multi-headed attention based on "Attention
|
||||
is all you Need". If `from_tensor` and `to_tensor` are the same, then
|
||||
this is self-attention. Each timestep in `from_tensor` attends to the
|
||||
corresponding sequence in `to_tensor`, and returns a fixed-width vector.
|
||||
|
||||
This function first projects `from_tensor` into a "query" tensor and
|
||||
`to_tensor` into "key" and "value" tensors. These are (effectively) a list
|
||||
of tensors of length `num_attention_heads`, where each tensor is of shape
|
||||
[batch_size, seq_length, size_per_head].
|
||||
|
||||
Then, the query and key tensors are dot-producted and scaled. These are
|
||||
softmaxed to obtain attention probabilities. The value tensors are then
|
||||
interpolated by these probabilities, then concatenated back to a single
|
||||
tensor and returned.
|
||||
|
||||
In practice, the multi-headed attention is done with transposes and
|
||||
reshapes rather than actual separate tensors.
|
||||
|
||||
Args:
|
||||
from_tensor: float Tensor of shape [batch_size, from_seq_length,
|
||||
from_width].
|
||||
to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
|
||||
attention_mask: (optional) int32 Tensor of shape [batch_size,
|
||||
from_seq_length, to_seq_length]. The values should be 1 or 0. The
|
||||
attention scores will effectively be set to -infinity for any positions in
|
||||
the mask that are 0, and will be unchanged for positions that are 1.
|
||||
num_attention_heads: int. Number of attention heads.
|
||||
size_per_head: int. Size of each attention head.
|
||||
query_act: (optional) Activation function for the query transform.
|
||||
key_act: (optional) Activation function for the key transform.
|
||||
value_act: (optional) Activation function for the value transform.
|
||||
attention_probs_dropout_prob: (optional) float. Dropout probability of the
|
||||
attention probabilities.
|
||||
initializer_range: float. Range of the weight initializer.
|
||||
do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
|
||||
* from_seq_length, num_attention_heads * size_per_head]. If False, the
|
||||
output will be of shape [batch_size, from_seq_length, num_attention_heads
|
||||
* size_per_head].
|
||||
batch_size: (Optional) int. If the input is 2D, this might be the batch size
|
||||
of the 3D version of the `from_tensor` and `to_tensor`.
|
||||
from_seq_length: (Optional) If the input is 2D, this might be the seq length
|
||||
of the 3D version of the `from_tensor`.
|
||||
to_seq_length: (Optional) If the input is 2D, this might be the seq length
|
||||
of the 3D version of the `to_tensor`.
|
||||
|
||||
Returns:
|
||||
float Tensor of shape [batch_size, from_seq_length,
|
||||
num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
|
||||
true, this will be of shape [batch_size * from_seq_length,
|
||||
num_attention_heads * size_per_head]).
|
||||
|
||||
Raises:
|
||||
ValueError: Any of the arguments or tensor shapes are invalid.
|
||||
"""
|
||||
|
||||
def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
|
||||
seq_length, width):
|
||||
output_tensor = tf.reshape(
|
||||
input_tensor, [batch_size, seq_length, num_attention_heads, width])
|
||||
|
||||
output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
|
||||
return output_tensor
|
||||
|
||||
from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
|
||||
to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
|
||||
|
||||
if len(from_shape) != len(to_shape):
|
||||
raise ValueError(
|
||||
"The rank of `from_tensor` must match the rank of `to_tensor`.")
|
||||
|
||||
if len(from_shape) == 3:
|
||||
batch_size = from_shape[0]
|
||||
from_seq_length = from_shape[1]
|
||||
to_seq_length = to_shape[1]
|
||||
elif len(from_shape) == 2:
|
||||
if (batch_size is None or from_seq_length is None or to_seq_length is None):
|
||||
raise ValueError(
|
||||
"When passing in rank 2 tensors to attention_layer, the values "
|
||||
"for `batch_size`, `from_seq_length`, and `to_seq_length` "
|
||||
"must all be specified.")
|
||||
|
||||
# Scalar dimensions referenced here:
|
||||
# B = batch size (number of sequences)
|
||||
# F = `from_tensor` sequence length
|
||||
# T = `to_tensor` sequence length
|
||||
# N = `num_attention_heads`
|
||||
# H = `size_per_head`
|
||||
|
||||
from_tensor_2d = reshape_to_matrix(from_tensor)
|
||||
to_tensor_2d = reshape_to_matrix(to_tensor)
|
||||
|
||||
# `query_layer` = [B*F, N*H]
|
||||
query_layer = tf.layers.dense(
|
||||
from_tensor_2d,
|
||||
num_attention_heads * size_per_head,
|
||||
activation=query_act,
|
||||
name="query",
|
||||
kernel_initializer=create_initializer(initializer_range))
|
||||
|
||||
# `key_layer` = [B*T, N*H]
|
||||
key_layer = tf.layers.dense(
|
||||
to_tensor_2d,
|
||||
num_attention_heads * size_per_head,
|
||||
activation=key_act,
|
||||
name="key",
|
||||
kernel_initializer=create_initializer(initializer_range))
|
||||
|
||||
# `value_layer` = [B*T, N*H]
|
||||
value_layer = tf.layers.dense(
|
||||
to_tensor_2d,
|
||||
num_attention_heads * size_per_head,
|
||||
activation=value_act,
|
||||
name="value",
|
||||
kernel_initializer=create_initializer(initializer_range))
|
||||
|
||||
# `query_layer` = [B, N, F, H]
|
||||
query_layer = transpose_for_scores(query_layer, batch_size,
|
||||
num_attention_heads, from_seq_length,
|
||||
size_per_head)
|
||||
|
||||
# `key_layer` = [B, N, T, H]
|
||||
key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
|
||||
to_seq_length, size_per_head)
|
||||
|
||||
# Take the dot product between "query" and "key" to get the raw
|
||||
# attention scores.
|
||||
# `attention_scores` = [B, N, F, T]
|
||||
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
|
||||
attention_scores = tf.multiply(attention_scores,
|
||||
1.0 / math.sqrt(float(size_per_head)))
|
||||
|
||||
if attention_mask is not None:
|
||||
# `attention_mask` = [B, 1, F, T]
|
||||
attention_mask = tf.expand_dims(attention_mask, axis=[1])
|
||||
|
||||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||
# masked positions, this operation will create a tensor which is 0.0 for
|
||||
# positions we want to attend and -10000.0 for masked positions.
|
||||
adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
|
||||
|
||||
# Since we are adding it to the raw scores before the softmax, this is
|
||||
# effectively the same as removing these entirely.
|
||||
attention_scores += adder
|
||||
|
||||
# Normalize the attention scores to probabilities.
|
||||
# `attention_probs` = [B, N, F, T]
|
||||
attention_probs = tf.nn.softmax(attention_scores)
|
||||
|
||||
# This is actually dropping out entire tokens to attend to, which might
|
||||
# seem a bit unusual, but is taken from the original Transformer paper.
|
||||
attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
|
||||
|
||||
# `value_layer` = [B, T, N, H]
|
||||
value_layer = tf.reshape(
|
||||
value_layer,
|
||||
[batch_size, to_seq_length, num_attention_heads, size_per_head])
|
||||
|
||||
# `value_layer` = [B, N, T, H]
|
||||
value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
|
||||
|
||||
# `context_layer` = [B, N, F, H]
|
||||
context_layer = tf.matmul(attention_probs, value_layer)
|
||||
|
||||
# `context_layer` = [B, F, N, H]
|
||||
context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
|
||||
|
||||
if do_return_2d_tensor:
|
||||
# `context_layer` = [B*F, N*H]
|
||||
context_layer = tf.reshape(
|
||||
context_layer,
|
||||
[batch_size * from_seq_length, num_attention_heads * size_per_head])
|
||||
else:
|
||||
# `context_layer` = [B, F, N*H]
|
||||
context_layer = tf.reshape(
|
||||
context_layer,
|
||||
[batch_size, from_seq_length, num_attention_heads * size_per_head])
|
||||
|
||||
return context_layer
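

def _example_scaled_dot_product_attention(query, key, value):
  """NumPy sketch of a single attention head, mirroring the steps above.

  `query`, `key` and `value` are assumed to be NumPy arrays of shape
  [seq_length, size_per_head]; this illustrates the matmul / scale / softmax /
  weighted-sum computation and is not a drop-in replacement for the TF code.
  """
  import numpy as np
  size_per_head = query.shape[-1]
  scores = query.dot(key.T) / np.sqrt(float(size_per_head))   # [F, T]
  probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
  probs = probs / probs.sum(axis=-1, keepdims=True)           # softmax over T
  return probs.dot(value)                                     # [F, size_per_head]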
|
||||
|
||||
|
||||
def transformer_model(input_tensor,
|
||||
attention_mask=None,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
intermediate_act_fn=get_activation('gelu'),
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
initializer_range=0.02,
|
||||
do_return_all_layers=False):
|
||||
"""Multi-headed, multi-layer Transformer from "Attention is All You Need".
|
||||
|
||||
This is almost an exact implementation of the original Transformer encoder.
|
||||
|
||||
See the original paper:
|
||||
https://arxiv.org/abs/1706.03762
|
||||
|
||||
Also see:
|
||||
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
|
||||
|
||||
Args:
|
||||
input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
|
||||
attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
|
||||
seq_length], with 1 for positions that can be attended to and 0 in
|
||||
positions that should not be.
|
||||
hidden_size: int. Hidden size of the Transformer.
|
||||
num_hidden_layers: int. Number of layers (blocks) in the Transformer.
|
||||
num_attention_heads: int. Number of attention heads in the Transformer.
|
||||
intermediate_size: int. The size of the "intermediate" (a.k.a., feed
|
||||
forward) layer.
|
||||
intermediate_act_fn: function. The non-linear activation function to apply
|
||||
to the output of the intermediate/feed-forward layer.
|
||||
hidden_dropout_prob: float. Dropout probability for the hidden layers.
|
||||
attention_probs_dropout_prob: float. Dropout probability of the attention
|
||||
probabilities.
|
||||
initializer_range: float. Range of the initializer (stddev of truncated
|
||||
normal).
|
||||
do_return_all_layers: Whether to also return all layers or just the final
|
||||
layer.
|
||||
|
||||
Returns:
|
||||
float Tensor of shape [batch_size, seq_length, hidden_size], the final
|
||||
hidden layer of the Transformer.
|
||||
|
||||
Raises:
|
||||
ValueError: A Tensor shape or parameter is invalid.
|
||||
"""
|
||||
if hidden_size % num_attention_heads != 0:
|
||||
raise ValueError(
|
||||
"The hidden size (%d) is not a multiple of the number of attention "
|
||||
"heads (%d)" % (hidden_size, num_attention_heads))
|
||||
|
||||
attention_head_size = int(hidden_size / num_attention_heads)
|
||||
input_shape = get_shape_list(input_tensor, expected_rank=3)
|
||||
batch_size = input_shape[0]
|
||||
seq_length = input_shape[1]
|
||||
input_width = input_shape[2]
|
||||
|
||||
# The Transformer performs sum residuals on all layers so the input needs
|
||||
# to be the same as the hidden size.
|
||||
if input_width != hidden_size:
|
||||
raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
|
||||
(input_width, hidden_size))
|
||||
|
||||
# We keep the representation as a 2D tensor to avoid re-shaping it back and
|
||||
# forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
|
||||
# the GPU/CPU but may not be free on the TPU, so we want to minimize them to
|
||||
# help the optimizer.
|
||||
prev_output = reshape_to_matrix(input_tensor)
|
||||
|
||||
all_layer_outputs = []
|
||||
for layer_idx in range(num_hidden_layers):
|
||||
with tf.variable_scope("layer_%d" % layer_idx):
|
||||
layer_input = prev_output
|
||||
|
||||
with tf.variable_scope("attention"):
|
||||
attention_heads = []
|
||||
with tf.variable_scope("self"):
|
||||
attention_head = attention_layer(
|
||||
from_tensor=layer_input,
|
||||
to_tensor=layer_input,
|
||||
attention_mask=attention_mask,
|
||||
num_attention_heads=num_attention_heads,
|
||||
size_per_head=attention_head_size,
|
||||
attention_probs_dropout_prob=attention_probs_dropout_prob,
|
||||
initializer_range=initializer_range,
|
||||
do_return_2d_tensor=True,
|
||||
batch_size=batch_size,
|
||||
from_seq_length=seq_length,
|
||||
to_seq_length=seq_length)
|
||||
attention_heads.append(attention_head)
|
||||
|
||||
attention_output = None
|
||||
if len(attention_heads) == 1:
|
||||
attention_output = attention_heads[0]
|
||||
else:
|
||||
# In the case where we have other sequences, we just concatenate
|
||||
# them to the self-attention head before the projection.
|
||||
attention_output = tf.concat(attention_heads, axis=-1)
|
||||
|
||||
# Run a linear projection of `hidden_size` then add a residual
|
||||
# with `layer_input`.
|
||||
with tf.variable_scope("output"):
|
||||
attention_output = tf.layers.dense(
|
||||
attention_output,
|
||||
hidden_size,
|
||||
kernel_initializer=create_initializer(initializer_range))
|
||||
attention_output = dropout(attention_output, hidden_dropout_prob)
|
||||
attention_output = layer_norm(attention_output + layer_input)
|
||||
|
||||
# The activation is only applied to the "intermediate" hidden layer.
|
||||
with tf.variable_scope("intermediate"):
|
||||
intermediate_output = tf.layers.dense(
|
||||
attention_output,
|
||||
intermediate_size,
|
||||
activation=intermediate_act_fn,
|
||||
kernel_initializer=create_initializer(initializer_range))
|
||||
|
||||
# Down-project back to `hidden_size` then add the residual.
|
||||
with tf.variable_scope("output"):
|
||||
layer_output = tf.layers.dense(
|
||||
intermediate_output,
|
||||
hidden_size,
|
||||
kernel_initializer=create_initializer(initializer_range))
|
||||
layer_output = dropout(layer_output, hidden_dropout_prob)
|
||||
layer_output = layer_norm(layer_output + attention_output)
|
||||
prev_output = layer_output
|
||||
all_layer_outputs.append(layer_output)
|
||||
|
||||
if do_return_all_layers:
|
||||
final_outputs = []
|
||||
for layer_output in all_layer_outputs:
|
||||
final_output = reshape_from_matrix(layer_output, input_shape)
|
||||
final_outputs.append(final_output)
|
||||
return final_outputs
|
||||
else:
|
||||
final_output = reshape_from_matrix(prev_output, input_shape)
|
||||
return final_output
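

def _example_transformer_model_call():
  """Minimal call sketch for `transformer_model` with arbitrary toy dimensions.

  All sizes below are assumptions for illustration; the input width must equal
  `hidden_size` because of the residual connections checked above.
  """
  input_tensor = tf.zeros([2, 16, 64])   # [batch_size, seq_length, hidden_size]
  output = transformer_model(
      input_tensor=input_tensor,
      hidden_size=64,
      num_hidden_layers=2,
      num_attention_heads=4,
      intermediate_size=256)
  # `output` has shape [2, 16, 64].
  return output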
|
||||
|
||||
|
||||
def get_shape_list(tensor, expected_rank=None, name=None):
|
||||
"""Returns a list of the shape of tensor, preferring static dimensions.
|
||||
|
||||
Args:
|
||||
tensor: A tf.Tensor object to find the shape of.
|
||||
expected_rank: (optional) int. The expected rank of `tensor`. If this is
|
||||
specified and the `tensor` has a different rank, an exception will be
|
||||
thrown.
|
||||
name: Optional name of the tensor for the error message.
|
||||
|
||||
Returns:
|
||||
A list of dimensions of the shape of tensor. All static dimensions will
|
||||
be returned as python integers, and dynamic dimensions will be returned
|
||||
as tf.Tensor scalars.
|
||||
"""
|
||||
if name is None:
|
||||
name = tensor.name
|
||||
|
||||
if expected_rank is not None:
|
||||
assert_rank(tensor, expected_rank, name)
|
||||
|
||||
shape = tensor.shape.as_list()
|
||||
|
||||
non_static_indexes = []
|
||||
for (index, dim) in enumerate(shape):
|
||||
if dim is None:
|
||||
non_static_indexes.append(index)
|
||||
|
||||
if not non_static_indexes:
|
||||
return shape
|
||||
|
||||
dyn_shape = tf.shape(tensor)
|
||||
for index in non_static_indexes:
|
||||
shape[index] = dyn_shape[index]
|
||||
return shape
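

def _example_get_shape_list():
  """Minimal sketch of static vs. dynamic dimensions, assuming a TF 1.x graph.

  With a placeholder whose batch dimension is unknown, the static dimensions
  come back as Python ints and the dynamic one as a scalar Tensor.
  """
  x = tf.placeholder(tf.float32, shape=[None, 16, 64])
  shape = get_shape_list(x, expected_rank=3)
  # shape == [<batch-size Tensor>, 16, 64]
  return shape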
|
||||
|
||||
|
||||
def reshape_to_matrix(input_tensor):
|
||||
"""Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
|
||||
ndims = input_tensor.shape.ndims
|
||||
if ndims < 2:
|
||||
raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
|
||||
(input_tensor.shape))
|
||||
if ndims == 2:
|
||||
return input_tensor
|
||||
|
||||
width = input_tensor.shape[-1]
|
||||
output_tensor = tf.reshape(input_tensor, [-1, width])
|
||||
return output_tensor
|
||||
|
||||
|
||||
def reshape_from_matrix(output_tensor, orig_shape_list):
|
||||
"""Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
|
||||
if len(orig_shape_list) == 2:
|
||||
return output_tensor
|
||||
|
||||
output_shape = get_shape_list(output_tensor)
|
||||
|
||||
orig_dims = orig_shape_list[0:-1]
|
||||
width = output_shape[-1]
|
||||
|
||||
return tf.reshape(output_tensor, orig_dims + [width])
|
||||
|
||||
|
||||
def assert_rank(tensor, expected_rank, name=None):
|
||||
"""Raises an exception if the tensor rank is not of the expected rank.
|
||||
|
||||
Args:
|
||||
tensor: A tf.Tensor to check the rank of.
|
||||
expected_rank: Python integer or list of integers, expected rank.
|
||||
name: Optional name of the tensor for the error message.
|
||||
|
||||
Raises:
|
||||
ValueError: If the expected shape doesn't match the actual shape.
|
||||
"""
|
||||
if name is None:
|
||||
name = tensor.name
|
||||
|
||||
expected_rank_dict = {}
|
||||
if isinstance(expected_rank, six.integer_types):
|
||||
expected_rank_dict[expected_rank] = True
|
||||
else:
|
||||
for x in expected_rank:
|
||||
expected_rank_dict[x] = True
|
||||
|
||||
actual_rank = tensor.shape.ndims
|
||||
if actual_rank not in expected_rank_dict:
|
||||
scope_name = tf.get_variable_scope().name
|
||||
raise ValueError(
|
||||
"For the tensor `%s` in scope `%s`, the actual rank "
|
||||
"`%d` (shape = %s) is not equal to the expected rank `%s`" %
|
||||
(name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
|
|
@ -0,0 +1,162 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Usage:
|
||||
error_analysis.py [options] MODEL_PATH (--standard-dataset | --method2code-dataset) DATA_PATH OUT_FILE
|
||||
|
||||
Options:
|
||||
-h --help Show this screen.
|
||||
--max-num-epochs EPOCHS The maximum number of epochs to run [default: 300]
|
||||
--max-num-files INT Number of files to load.
|
||||
--max-num-examples INT Randomly sample examples from the dataset to display.
|
||||
--hypers-override HYPERS JSON dictionary overriding hyperparameter values.
|
||||
--hypers-override-file FILE JSON file overriding hyperparameter values.
|
||||
--test-batch-size SIZE The size of the batches in which to compute MRR. [default: 1000]
|
||||
--distance-metric METRIC The distance metric to use [default: cosine]
|
||||
--quiet Less output (not one per line per minibatch). [default: False]
|
||||
--azure-info PATH Azure authentication information file (JSON). Used to load data from Azure storage.
|
||||
--debug Enable debug routines. [default: False]
|
||||
--standard-dataset The DATA_PATH is to a standard dataset.
|
||||
--method2code-dataset The DATA_PATH is to a standard dataset but will be used for method2code tasks.
|
||||
--language-to-analyze LANG The language to analyze. Defaults to all.
|
||||
"""
|
||||
import io
|
||||
import json
|
||||
from typing import List, Dict, Any, Optional
|
||||
from tqdm import tqdm
|
||||
|
||||
from pygments import highlight
|
||||
from pygments.lexers import get_lexer_by_name
|
||||
from pygments.formatters import HtmlFormatter
|
||||
from docopt import docopt
|
||||
from dpu_utils.utils import run_and_debug, RichPath
|
||||
|
||||
import model_test
|
||||
from model_test import expand_data_path, MrrSearchTester
|
||||
from random import sample
|
||||
|
||||
|
||||
## Default Bootstrap headers
|
||||
HEADER=f"""
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<!-- Required meta tags -->
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
|
||||
<!-- Bootstrap CSS -->
|
||||
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
|
||||
|
||||
<title>Error Analysis</title>
|
||||
|
||||
<style>
|
||||
{HtmlFormatter().get_style_defs('.highlight')}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
"""
|
||||
FOOTER="""
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
def to_highlighted_html(code:str, language: str) -> str:
|
||||
lexer = get_lexer_by_name(language, stripall=True)
|
||||
formatter = HtmlFormatter(linenos=True)
|
||||
return highlight(code, lexer, formatter)
|
||||
|
||||
def generate_html_error_report(tester: MrrSearchTester,
|
||||
data: List[Dict[str, Any]],
|
||||
max_num_examples: Optional[int],
|
||||
outfile: str,
|
||||
filter_language: Optional[str] = None) -> None:
|
||||
|
||||
error_log = [] # type: List[MrrSearchTester.QueryResult]
|
||||
# Sample the data if requested
|
||||
data = sample_data(data=data,
|
||||
max_num_examples=max_num_examples)
|
||||
|
||||
# generate error logs
|
||||
tester.update_test_batch_size(max_num_examples)
|
||||
tester.evaluate(data, 'Error Analysis Run', error_log, filter_language=filter_language)
|
||||
|
||||
"Generates HTML Report of Errors."
|
||||
print('Generating Report')
|
||||
with open(outfile, 'w') as f:
|
||||
f.write(HEADER)
|
||||
for query_result in tqdm(error_log, total=len(error_log)):
|
||||
with io.StringIO() as sb:
|
||||
target_code = data[query_result.target_idx]['code']
|
||||
target_query = data[query_result.target_idx]['docstring'].replace('\n', ' ')
|
||||
language = data[query_result.target_idx]['language']
|
||||
sb.write(f'<h2> Query: "{target_query}"</h2>\n\n')
|
||||
sb.write(f'<strong>Target Snippet</strong>\n{to_highlighted_html(target_code, language=language)}\n')
|
||||
sb.write(f'Target snippet was ranked at position <strong>{query_result.target_rank}</strong>.\n')
|
||||
|
||||
sb.write('<div class="row">\n')
|
||||
for pos, sample_idx in enumerate(query_result.top_ranked_idxs):
|
||||
sb.write('<div class="col-sm">\n')
|
||||
sb.write(f'<strong>Result at {pos+1}</strong>\n')
|
||||
sb.write(f'{data[sample_idx]["repo"]} {data[sample_idx]["path"]}:{data[sample_idx]["lineno"]}\n')
|
||||
result_docstring = data[sample_idx]['docstring']
|
||||
result_code = data[sample_idx]['code']
|
||||
lang = data[sample_idx]['language']
|
||||
sb.write(f'<blockquote><p> Docstring: <em>{result_docstring}</em></blockquote>\n{to_highlighted_html(result_code, language=lang)}\n\n')
|
||||
sb.write('</div>\n')
|
||||
sb.write('</div>\n<hr/>\n')
|
||||
f.write(sb.getvalue())
|
||||
f.write(FOOTER)
|
||||
|
||||
|
||||
def sample_data(data: List[Dict[str, Any]],
|
||||
max_num_examples: Optional[int]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Sample max_num_examples from the data.
|
||||
|
||||
Args:
|
||||
data: List[Dict[str, Any]]
|
||||
max_num_examples: either an int or if a string will attempt conversion to an int.
|
||||
|
||||
Returns:
|
||||
data: List[Dict[str, Any]]
|
||||
"""
|
||||
if max_num_examples:
|
||||
num_elements = min(len(data), max_num_examples)
|
||||
print(f'Extracting {num_elements} random samples from dataset.')
|
||||
data = sample(data, num_elements)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def run(arguments):
|
||||
max_num_examples = int(arguments.get('--max-num-examples')) if arguments.get('--max-num-examples') else None
|
||||
azure_info_path = arguments.get('--azure-info', None)
|
||||
test_data_dirs = expand_data_path(arguments['DATA_PATH'], azure_info_path)
|
||||
|
||||
if arguments['--hypers-override'] is not None:
|
||||
hypers_override = json.loads(arguments['--hypers-override'])
|
||||
elif arguments['--hypers-override-file'] is not None:
|
||||
with open(arguments['--hypers-override-file']) as f:
|
||||
hypers_override = json.load(f)
|
||||
else:
|
||||
hypers_override = {}
|
||||
|
||||
model_path = RichPath.create(arguments['MODEL_PATH'], azure_info_path=azure_info_path)
|
||||
|
||||
tester = MrrSearchTester(model_path, test_batch_size=int(arguments['--test-batch-size']),
|
||||
distance_metric=arguments['--distance-metric'], hypers_override=hypers_override)
|
||||
|
||||
# Load dataset
|
||||
if arguments['--standard-dataset'] or arguments['--method2code-dataset']:
|
||||
data = model_test.get_dataset_from(test_data_dirs, use_func_names=arguments['--method2code-dataset'])
|
||||
else:
|
||||
raise Exception('No dataset option seems to have been passed in.')
|
||||
|
||||
generate_html_error_report(tester=tester,
|
||||
data=data,
|
||||
max_num_examples=max_num_examples,
|
||||
outfile=arguments['OUT_FILE'],
|
||||
filter_language=arguments.get('--language-to-analyze'))
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
run_and_debug(lambda: run(args), args.get('--debug', False))
|
|
@ -0,0 +1,58 @@
from typing import Dict, Any, Optional, Type

import tensorflow as tf
from dpu_utils.utils import RichPath

from models import Model, NeuralBoWModel, RNNModel, SelfAttentionModel, ConvolutionalModel, ConvSelfAttentionModel


def get_model_class_from_name(model_name: str) -> Type[Model]:
    model_name = model_name.lower()
    if model_name in ['neuralbow', 'neuralbowmodel']:
        return NeuralBoWModel
    elif model_name in ['rnn', 'rnnmodel']:
        return RNNModel
    elif model_name in {'selfatt', 'selfattention', 'selfattentionmodel'}:
        return SelfAttentionModel
    elif model_name in {'1dcnn', 'convolutionalmodel'}:
        return ConvolutionalModel
    elif model_name in {'convselfatt', 'convselfattentionmodel'}:
        return ConvSelfAttentionModel
    else:
        raise Exception("Unknown model '%s'!" % model_name)


def restore(path: RichPath, is_train: bool, hyper_overrides: Optional[Dict[str, Any]]=None) -> Model:
    saved_data = path.read_as_pickle()

    if hyper_overrides is not None:
        saved_data['hyperparameters'].update(hyper_overrides)

    model_class = get_model_class_from_name(saved_data['model_type'])
    model = model_class(saved_data['hyperparameters'], saved_data.get('run_name'))
    model.query_metadata.update(saved_data['query_metadata'])
    for (language, language_metadata) in saved_data['per_code_language_metadata'].items():
        model.per_code_language_metadata[language] = language_metadata
    model.make_model(is_train=is_train)

    variables_to_initialize = []
    with model.sess.graph.as_default():
        with tf.name_scope("restore"):
            restore_ops = []
            used_vars = set()
            for variable in sorted(model.sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES), key=lambda v: v.name):
                used_vars.add(variable.name)
                if variable.name in saved_data['weights']:
                    # print('Initializing %s from saved value.' % variable.name)
                    restore_ops.append(variable.assign(saved_data['weights'][variable.name]))
                else:
                    print('Freshly initializing %s since no saved value was found.' % variable.name)
                    variables_to_initialize.append(variable)
            for var_name in sorted(saved_data['weights']):
                if var_name not in used_vars:
                    if var_name.endswith('Adam:0') or var_name.endswith('Adam_1:0') or var_name in ['beta1_power:0', 'beta2_power:0']:
                        continue
                    print('Saved weights for %s not used by model.' % var_name)
            restore_ops.append(tf.variables_initializer(variables_to_initialize))
            model.sess.run(restore_ops)
            return model
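

def _example_restore_usage():
    """Minimal usage sketch; the model path and hyper override below are assumptions."""
    model_path = RichPath.create('../resources/saved_models/neuralbowmodel.pkl.gz')
    model = restore(model_path, is_train=False,
                    hyper_overrides={'code_max_num_tokens': 200})
    return model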
|
|
@ -0,0 +1,260 @@
|
|||
from collections import defaultdict
|
||||
from itertools import chain
|
||||
from typing import Optional, List, Dict, Any, NamedTuple, Iterable, Tuple
|
||||
import logging
|
||||
import random
|
||||
|
||||
from dpu_utils.mlutils import Vocabulary
|
||||
from dpu_utils.utils import RichPath
|
||||
import numpy as np
|
||||
from more_itertools import chunked, flatten
|
||||
from scipy.spatial.distance import cdist
|
||||
import wandb
|
||||
|
||||
import model_restore_helper
|
||||
from models.model import get_data_files_from_directory, Model
|
||||
from dataextraction.python.parse_python_data import tokenize_python_from_string
|
||||
from dataextraction.utils import tokenize_docstring_from_string
|
||||
from dpu_utils.codeutils import split_identifier_into_parts
|
||||
|
||||
|
||||
def compute_ranks(src_representations: np.ndarray,
|
||||
tgt_representations: np.ndarray,
|
||||
distance_metric: str) -> Tuple[np.array, np.array]:
|
||||
distances = cdist(src_representations, tgt_representations,
|
||||
metric=distance_metric)
|
||||
# By construction the diagonal contains the correct elements
|
||||
correct_elements = np.expand_dims(np.diag(distances), axis=-1)
|
||||
return np.sum(distances <= correct_elements, axis=-1), distances
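

def _example_compute_ranks():
    """Minimal sketch: with identical source and target representations, every
    query ranks its own (diagonal) snippet first, so all ranks are 1."""
    reps = np.eye(3, dtype=np.float32)
    ranks, distances = compute_ranks(reps, reps, 'cosine')
    # ranks == array([1, 1, 1])
    return ranks, distances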
|
||||
|
||||
|
||||
class MrrSearchTester:
|
||||
def __init__(self, model_path: RichPath, test_batch_size: int=1000, distance_metric: str='cosine',
|
||||
quiet: bool=False, hypers_override: Optional[Dict[str, Any]]=None) -> None:
|
||||
self.__model = model_restore_helper.restore(path=model_path,
|
||||
is_train=False,
|
||||
hyper_overrides=hypers_override)
|
||||
self.__quiet = quiet
|
||||
self.__test_batch_size = test_batch_size
|
||||
self.__distance_metric = distance_metric
|
||||
|
||||
@property
|
||||
def model(self) -> Model:
|
||||
return self.__model
|
||||
|
||||
@property
|
||||
def test_batch_size(self)-> int:
|
||||
return self.__test_batch_size
|
||||
|
||||
def update_test_batch_size(self, test_batch_size: int)-> None:
|
||||
self.__test_batch_size = test_batch_size
|
||||
|
||||
QueryResult = NamedTuple('QueryResult', [
|
||||
('target_idx', int),
|
||||
('target_rank', int),
|
||||
('top_ranked_idxs', List[int])
|
||||
])
|
||||
|
||||
def evaluate(self, data: List[Dict[str, Any]], data_label_name: str,
|
||||
error_log: Optional[List['MrrSearchTester.QueryResult']]=None,
|
||||
error_log_rank_threshold: int=10,
|
||||
filter_language: Optional[str]=None)-> float:
|
||||
"""
|
||||
Evaluate the MRR on the given dataset.
|
||||
|
||||
:param data: the data to test on.
|
||||
:param data_label_name: A label used when printing the result output.
|
||||
:param error_log: If not null, store in the log, results where the target rank is above the threshold.
|
||||
:param error_log_rank_threshold: The threshold for logging into error_log (used only if error_log is not None)
|
||||
:return: the mean reciprocal rank (MRR) score
|
||||
"""
|
||||
assert len(data) > 0, 'data must have more than 0 rows.'
|
||||
np.random.seed(0) # set random seed so that random things are reproducible
|
||||
|
||||
if filter_language is None:
|
||||
idxs = np.arange(len(data))
|
||||
else:
|
||||
idxs = np.array([i for i in range(len(data)) if data[i]['language'] == filter_language])
|
||||
if len(idxs) == 0:
|
||||
print('Warning: Trying to test on empty dataset. Skipping.')
|
||||
return float('nan')
|
||||
data = np.array(data, dtype=np.object)
|
||||
np.random.shuffle(idxs)
|
||||
data = data[idxs]
|
||||
|
||||
if len(data) < self.__test_batch_size:
|
||||
logging.warning(f'The size of the total data {len(data):,} is less than the batch_size {self.__test_batch_size:,}; adjusting batch size to equal data size.')
|
||||
self.update_test_batch_size(len(data))
|
||||
|
||||
def self_or_random_representation(representation: Optional[np.ndarray]) -> np.ndarray:
|
||||
if representation is not None:
|
||||
return representation
|
||||
else:
|
||||
return np.random.randn(self.__model.representation_size)
|
||||
|
||||
# Determine random sample of examples before chunking into batches.
|
||||
# sample only from full batches
|
||||
max_samples = 50
|
||||
full_batch_len = len(data) // self.__test_batch_size * self.__test_batch_size
|
||||
examples_sample = np.zeros(len(data), dtype=bool)
|
||||
examples_sample[np.random.choice(np.arange(full_batch_len), replace=False, size=min(full_batch_len, max_samples))] = True
|
||||
examples_table = []
|
||||
|
||||
sum_mrr = 0.0
|
||||
num_batches = 0
|
||||
batched_data = chunked(data, self.__test_batch_size)
|
||||
batched_sample = chunked(examples_sample, self.__test_batch_size)
|
||||
for batch_idx, (batch_data, batch_sample) in enumerate(zip(batched_data, batched_sample)):
|
||||
if len(batch_data) < self.__test_batch_size:
|
||||
break # the last batch is smaller than the others, exclude.
|
||||
num_batches += 1
|
||||
|
||||
code_representations = self.__model.get_code_representations(batch_data)
|
||||
query_representations = self.__model.get_query_representations(batch_data)
|
||||
assert len(code_representations) == len(query_representations) == self.__test_batch_size
|
||||
|
||||
# Construct numpy batch
|
||||
num_uncomputed_representations = sum(1 for i in range(self.__test_batch_size)
|
||||
if code_representations[i] is None or query_representations[i] is None)
|
||||
if num_uncomputed_representations > 0:
|
||||
print(f'Ignoring {num_uncomputed_representations} samples whose representation could not be computed')
|
||||
|
||||
# Design decision: If a representation cannot be computed assign a random representation. This keeps
|
||||
# the batch size identical across all models.
|
||||
batch_code_representations = np.array(
|
||||
[self_or_random_representation(code_representations[i]) for i in range(self.__test_batch_size)],
|
||||
dtype=np.float32)
|
||||
batch_query_representations = np.array(
|
||||
[self_or_random_representation(query_representations[i]) for i in range(self.__test_batch_size)],
|
||||
dtype=np.float32)
|
||||
|
||||
ranks, distances = compute_ranks(batch_code_representations,
|
||||
batch_query_representations,
|
||||
self.__distance_metric)
|
||||
|
||||
# Log example tables for a sample of rankings of queries for each dataset
|
||||
if wandb.run:
|
||||
examples_table_name = data_label_name[:-len("-All")] if data_label_name.endswith("-All") else data_label_name  # strip the "-All" suffix (rstrip removes characters, not a suffix)
|
||||
examples_table_columns = ["Rank", "Language", "Query", "Code"]
|
||||
for example, sample, rank in zip(batch_data, batch_sample, ranks):
|
||||
if not sample:
|
||||
continue
|
||||
language = example['language']
|
||||
markdown_code = "```%s\n" % language + example['code'].strip("\n") + "\n```"
|
||||
examples_table.append([rank, language, example['func_name'], markdown_code])
|
||||
|
||||
sum_mrr += np.mean(1.0 / ranks)
|
||||
|
||||
if error_log is not None:
|
||||
batch_sample_idxs = idxs[batch_idx*self.__test_batch_size:(batch_idx+1)*self.__test_batch_size]
|
||||
for i in range(len(ranks)):
|
||||
if ranks[i] >= error_log_rank_threshold:
|
||||
result = MrrSearchTester.QueryResult(
|
||||
target_idx=batch_sample_idxs[i],
|
||||
target_rank=ranks[i],
|
||||
top_ranked_idxs=batch_sample_idxs[np.argsort(distances[i])[:3]]
|
||||
)
|
||||
error_log.append(result)
|
||||
|
||||
if self.__quiet and batch_idx % 100 == 99:
|
||||
print(f'Tested on {batch_idx + 1} batches so far.')
|
||||
|
||||
if wandb.run and examples_table:
|
||||
wandb.log({"Examples-%s" % examples_table_name: wandb.Table(columns=examples_table_columns, rows=examples_table)})
|
||||
|
||||
eval_mrr = sum_mrr / num_batches
|
||||
log_label = f'{data_label_name} MRR (bs={self.__test_batch_size:,})'
|
||||
print(f'{log_label}: {eval_mrr: .3f}')
|
||||
if wandb.run:
|
||||
wandb.run.summary[f'{log_label}'] = eval_mrr
|
||||
return eval_mrr
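

def _example_mean_reciprocal_rank():
    """Minimal sketch of the MRR computed above: the mean of reciprocal ranks.

    The ranks below are made-up values; e.g. ranks [1, 2, 4] give
    (1 + 0.5 + 0.25) / 3 = 0.583.
    """
    ranks = np.array([1, 2, 4], dtype=np.float64)
    return float(np.mean(1.0 / ranks))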
|
||||
|
||||
|
||||
def expand_data_path(data_path: str, azure_info_path: Optional[str]) -> List[RichPath]:
|
||||
"""
|
||||
Args:
|
||||
data_path: A path to either a file or a directory. If it's a file, we interpret it as a list of
|
||||
data directories.
|
||||
|
||||
Returns:
|
||||
List of data directories (potentially just data_path)
|
||||
"""
|
||||
data_rpath = RichPath.create(data_path, azure_info_path)
|
||||
|
||||
if data_rpath.is_dir():
|
||||
return [data_rpath]
|
||||
|
||||
return [RichPath.create(data_dir, azure_info_path)
|
||||
for data_dir in data_rpath.read_as_text().splitlines()]
|
||||
|
||||
|
||||
def filter_untokenizable_code(data: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Filter out data where field code_tokens is empty."""
|
||||
return [d for d in data if d['code_tokens']]
|
||||
|
||||
def log_row_count_diff(original_data: Iterable[Any], filtered_data:Iterable[Any], label: str) -> None:
|
||||
"""Compute the difference between row counts and log appropriately."""
|
||||
original_row_count = len(list(original_data))
|
||||
filtered_row_count = len(list(filtered_data))
|
||||
|
||||
assert original_row_count > 0, 'original_data does not contain any rows.'
|
||||
assert filtered_row_count <= original_row_count, f'filtered_data {filtered_row_count:,} has a larger row count than original_data {original_row_count:,}.'
|
||||
|
||||
pcnt_parsed = filtered_row_count / original_row_count
|
||||
print(f'{label}: parsed {filtered_row_count:,} out of {original_row_count:,} rows. ({pcnt_parsed*100:.1f}%)')
|
||||
if wandb.run:
|
||||
wandb.run.summary[f'{label} Parsed Pct'] = pcnt_parsed
|
||||
|
||||
|
||||
def get_dataset_from(data_dirs: List[RichPath],
|
||||
use_func_names: bool=False,
|
||||
max_files_per_dir: Optional[int] = None) -> List[Dict[str, Any]]:
|
||||
data_files = sorted(get_data_files_from_directory(data_dirs, max_files_per_dir))
|
||||
data = list(chain(*chain(list(f.read_by_file_suffix()) for f in data_files)))
|
||||
|
||||
if use_func_names:
|
||||
# This task tries to match the function name to the code, by setting the function name as the query
|
||||
for sample in data:
|
||||
# Replace the query tokens with the function name, broken up into its sub-tokens:
|
||||
sample['docstring_tokens'] = split_identifier_into_parts(sample['func_name'])
|
||||
|
||||
# In the code, replace the function name with the out-of-vocab token everywhere it appears:
|
||||
sample['code_tokens'] = [Vocabulary.get_unk() if token == sample['func_name'] else token
|
||||
for token in sample['code_tokens']]
|
||||
return data
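

def _example_func_name_query():
    """Minimal sketch of the `use_func_names` rewrite above, on a made-up sample."""
    sample = {'func_name': 'get_user_name',
              'code_tokens': ['def', 'get_user_name', '(', 'self', ')', ':'],
              'docstring_tokens': ['returns', 'the', 'user', 'name']}
    # The query becomes the sub-tokens of the function name: ['get', 'user', 'name'].
    sample['docstring_tokens'] = split_identifier_into_parts(sample['func_name'])
    # Every occurrence of the function name in the code is masked with the UNK token.
    sample['code_tokens'] = [Vocabulary.get_unk() if token == sample['func_name'] else token
                             for token in sample['code_tokens']]
    return sample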
|
||||
|
||||
|
||||
def compute_evaluation_metrics(model_path: RichPath, arguments,
|
||||
azure_info_path: str,
|
||||
valid_data_dirs: List[RichPath],
|
||||
test_data_dirs: List[RichPath],
|
||||
max_files_per_dir: Optional[int] = None):
|
||||
|
||||
tester = MrrSearchTester(model_path, test_batch_size=int(arguments['--test-batch-size']),
|
||||
distance_metric=arguments['--distance-metric'])
|
||||
test_data = get_dataset_from(test_data_dirs, max_files_per_dir=max_files_per_dir)
|
||||
# Get all languages in test_data
|
||||
dataset_languages = set(d['language'] for d in test_data)
|
||||
evaluation_sets = list((l, True) for l in dataset_languages) # type: List[Tuple[str, bool]]
|
||||
if set(tester.model.per_code_language_metadata.keys()) == dataset_languages:
|
||||
evaluation_sets = [('All', False)] + evaluation_sets
|
||||
final_eval = {} # type: Dict[str, float]
|
||||
for language_name, filter_language in evaluation_sets:
|
||||
if filter_language and language_name not in tester.model.per_code_language_metadata:
|
||||
continue
|
||||
mrr = tester.evaluate(test_data, f'Test-{language_name}', filter_language=language_name if filter_language else None)
|
||||
if language_name == "All":
|
||||
final_eval['Primary MRR'] = mrr
|
||||
|
||||
# run test using the function name as the query
|
||||
mrr = tester.evaluate(get_dataset_from(test_data_dirs, use_func_names=True, max_files_per_dir=max_files_per_dir), f'FuncNameTest-{language_name}',
|
||||
filter_language=language_name if filter_language else None)
|
||||
if language_name == "All":
|
||||
final_eval['FuncName MRR'] = mrr
|
||||
|
||||
# run the test procedure on the validation set (with same batch size as test, so that MRR is comparable)
|
||||
tester.evaluate(get_dataset_from(valid_data_dirs, max_files_per_dir=max_files_per_dir), f'Validation-{language_name}',
|
||||
filter_language=language_name if filter_language else None)
|
||||
|
||||
if wandb.run and final_eval:
|
||||
wandb.run.summary['Eval'] = final_eval
|
|
@ -0,0 +1,6 @@
from .model import Model
from .nbow_model import NeuralBoWModel
from .rnn_model import RNNModel
from .self_att_model import SelfAttentionModel
from .conv_model import ConvolutionalModel
from .conv_self_att_model import ConvSelfAttentionModel
|
|
@ -0,0 +1,35 @@
from typing import Any, Dict, Optional

from encoders import ConvolutionSeqEncoder
from .model import Model


class ConvolutionalModel(Model):
    @classmethod
    def get_default_hyperparameters(cls) -> Dict[str, Any]:
        hypers = {}
        for label in ["code", "query"]:
            hypers.update({f'{label}_{key}': value
                           for key, value in ConvolutionSeqEncoder.get_default_hyperparameters().items()})
        model_hypers = {
            'learning_rate': 5e-4,
            'code_use_subtokens': False,
            'code_mark_subtoken_end': False,
            'batch_size': 1000,
        }
        hypers.update(super().get_default_hyperparameters())
        hypers.update(model_hypers)
        return hypers

    def __init__(self,
                 hyperparameters: Dict[str, Any],
                 run_name: str = None,
                 model_save_dir: Optional[str] = None,
                 log_save_dir: Optional[str] = None):
        super().__init__(
            hyperparameters,
            code_encoder_type=ConvolutionSeqEncoder,
            query_encoder_type=ConvolutionSeqEncoder,
            run_name=run_name,
            model_save_dir=model_save_dir,
            log_save_dir=log_save_dir)
|
|
@ -0,0 +1,35 @@
from typing import Any, Dict, Optional

from encoders import ConvSelfAttentionEncoder
from .model import Model


class ConvSelfAttentionModel(Model):
    @classmethod
    def get_default_hyperparameters(cls) -> Dict[str, Any]:
        hypers = {}
        for label in ["code", "query"]:
            hypers.update({f'{label}_{key}': value
                           for key, value in ConvSelfAttentionEncoder.get_default_hyperparameters().items()})
        model_hypers = {
            'learning_rate': 5e-4,
            'code_use_subtokens': False,
            'code_mark_subtoken_end': False,
            'batch_size': 650,
        }
        hypers.update(super().get_default_hyperparameters())
        hypers.update(model_hypers)
        return hypers

    def __init__(self,
                 hyperparameters: Dict[str, Any],
                 run_name: str = None,
                 model_save_dir: Optional[str] = None,
                 log_save_dir: Optional[str] = None):
        super().__init__(
            hyperparameters,
            code_encoder_type=ConvSelfAttentionEncoder,
            query_encoder_type=ConvSelfAttentionEncoder,
            run_name=run_name,
            model_save_dir=model_save_dir,
            log_save_dir=log_save_dir)
|
|
@ -0,0 +1,943 @@
|
|||
import os
|
||||
import itertools
|
||||
import multiprocessing
|
||||
import random
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from collections import defaultdict, OrderedDict
|
||||
from enum import Enum, auto
|
||||
from typing import List, Dict, Any, Iterable, Tuple, Optional, Union, Callable, Type, DefaultDict
|
||||
|
||||
import numpy as np
|
||||
import wandb
|
||||
import tensorflow as tf
|
||||
from dpu_utils.utils import RichPath
|
||||
|
||||
from utils.py_utils import run_jobs_in_parallel
|
||||
from encoders import Encoder, QueryType
|
||||
|
||||
|
||||
LoadedSamples = Dict[str, List[Dict[str, Any]]]
|
||||
SampleId = Tuple[str, int]
|
||||
|
||||
|
||||
class RepresentationType(Enum):
|
||||
CODE = auto()
|
||||
QUERY = auto()
|
||||
|
||||
|
||||
def get_data_files_from_directory(data_dirs: List[RichPath],
|
||||
max_files_per_dir: Optional[int] = None) -> List[RichPath]:
|
||||
files = [] # type: List[str]
|
||||
for data_dir in data_dirs:
|
||||
dir_files = data_dir.get_filtered_files_in_dir('*.jsonl.gz')
|
||||
if max_files_per_dir:
|
||||
dir_files = sorted(dir_files)[:int(max_files_per_dir)]
|
||||
files += dir_files
|
||||
|
||||
np.random.shuffle(files) # This avoids having large_file_0, large_file_1, ... subsequences
|
||||
return files
|
||||
|
||||
|
||||
def parse_data_file(hyperparameters: Dict[str, Any],
|
||||
code_encoder_class: Type[Encoder],
|
||||
per_code_language_metadata: Dict[str, Dict[str, Any]],
|
||||
query_encoder_class: Type[Encoder],
|
||||
query_metadata: Dict[str, Any],
|
||||
is_test: bool,
|
||||
data_file: RichPath) -> Dict[str, List[Tuple[bool, Dict[str, Any]]]]:
|
||||
results: DefaultDict[str, List] = defaultdict(list)
|
||||
for raw_sample in data_file.read_by_file_suffix():
|
||||
sample: Dict = {}
|
||||
language = raw_sample['language']
|
||||
if language.startswith('python'): # In some datasets, we use 'python-2.7' and 'python-3'
|
||||
language = 'python'
|
||||
|
||||
# the load_data_from_sample method call places processed data into sample, and
|
||||
# returns a boolean flag indicating if sample should be used
|
||||
function_name = raw_sample.get('func_name')
|
||||
use_code_flag = code_encoder_class.load_data_from_sample("code",
|
||||
hyperparameters,
|
||||
per_code_language_metadata[language],
|
||||
raw_sample['code_tokens'],
|
||||
function_name,
|
||||
sample,
|
||||
is_test)
|
||||
|
||||
use_query_flag = query_encoder_class.load_data_from_sample("query",
|
||||
hyperparameters,
|
||||
query_metadata,
|
||||
[d.lower() for d in raw_sample['docstring_tokens']],
|
||||
function_name,
|
||||
sample,
|
||||
is_test)
|
||||
use_example = use_code_flag and use_query_flag
|
||||
results[language].append((use_example, sample))
|
||||
return results
|
||||
|
||||
|
||||
class Model(ABC):
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def get_default_hyperparameters(cls) -> Dict[str, Any]:
|
||||
return {
|
||||
'batch_size': 200,
|
||||
|
||||
'optimizer': 'Adam',
|
||||
'seed': 0,
|
||||
'dropout_keep_rate': 0.9,
|
||||
'learning_rate': 0.01,
|
||||
'learning_rate_code_scale_factor': 1.,
|
||||
'learning_rate_query_scale_factor': 1.,
|
||||
'learning_rate_decay': 0.98,
|
||||
'momentum': 0.85,
|
||||
'gradient_clip': 1,
|
||||
'loss': 'softmax', # One of softmax, cosine, max-margin
|
||||
'margin': 1,
|
||||
'max_epochs': 500,
|
||||
'patience': 5,
|
||||
|
||||
# Fraction of samples for which the query should be the function name instead of the docstring:
|
||||
'fraction_using_func_name': 0.1,
|
||||
# Only functions with a name at least this long will be candidates for training with the function name
|
||||
# as the query instead of the docstring:
|
||||
'min_len_func_name_for_query': 12,
|
||||
# Frequency at which random, common tokens are added into the query:
|
||||
'query_random_token_frequency': 0.,
|
||||
|
||||
# Maximal number of tokens considered to compute a representation for code/query:
|
||||
'code_max_num_tokens': 200,
|
||||
'query_max_num_tokens': 30,
|
||||
}
|
||||
|
||||
def __init__(self,
|
||||
hyperparameters: Dict[str, Any],
|
||||
code_encoder_type: Type[Encoder],
|
||||
query_encoder_type: Type[Encoder],
|
||||
run_name: Optional[str] = None,
|
||||
model_save_dir: Optional[str] = None,
|
||||
log_save_dir: Optional[str] = None) -> None:
|
||||
self.__code_encoder_type = code_encoder_type
|
||||
self.__code_encoders: OrderedDict[str, Any] = OrderedDict() # OrderedDict as we are using the order of languages a few times...
|
||||
|
||||
self.__query_encoder_type = query_encoder_type
|
||||
self.__query_encoder: Any = None
|
||||
|
||||
# start with default hyper-params and then override them
|
||||
self.hyperparameters = self.get_default_hyperparameters()
|
||||
self.hyperparameters.update(hyperparameters)
|
||||
|
||||
self.__query_metadata: Dict[str, Any] = {}
|
||||
self.__per_code_language_metadata: Dict[str, Any] = {}
|
||||
self.__placeholders: Dict[str, Union[tf.placeholder, tf.placeholder_with_default]] = {}
|
||||
self.__ops: Dict[str, Any] = {}
|
||||
if run_name is None:
|
||||
run_name = type(self).__name__
|
||||
self.__run_name = run_name
|
||||
|
||||
if model_save_dir is None:
|
||||
self.__model_save_dir = os.environ.get('PHILLY_MODEL_DIRECTORY', default='.') # type: str
|
||||
else:
|
||||
self.__model_save_dir = model_save_dir # type: str
|
||||
|
||||
if log_save_dir is None:
|
||||
self.__log_save_dir = os.environ.get('PHILLY_LOG_DIRECTORY', default='.') # type: str
|
||||
else:
|
||||
self.__log_save_dir = log_save_dir # type: str
|
||||
|
||||
config = tf.ConfigProto()
|
||||
config.gpu_options.allow_growth = True
|
||||
if "gpu_device_id" in self.hyperparameters:
|
||||
config.gpu_options.visible_device_list = str(self.hyperparameters["gpu_device_id"])
|
||||
|
||||
graph = tf.Graph()
|
||||
self.__sess = tf.Session(graph=graph, config=config)
|
||||
|
||||
# Directory used for TensorBoard summaries.
|
||||
self.__tensorboard_dir = self.__log_save_dir  # use the resolved log dir rather than the raw (possibly None) argument
|
||||
|
||||
@property
|
||||
def query_metadata(self):
|
||||
return self.__query_metadata
|
||||
|
||||
@property
|
||||
def per_code_language_metadata(self):
|
||||
return self.__per_code_language_metadata
|
||||
|
||||
@property
|
||||
def placeholders(self):
|
||||
return self.__placeholders
|
||||
|
||||
@property
|
||||
def ops(self):
|
||||
return self.__ops
|
||||
|
||||
@property
|
||||
def sess(self):
|
||||
return self.__sess
|
||||
|
||||
@property
|
||||
def run_name(self):
|
||||
return self.__run_name
|
||||
|
||||
@property
|
||||
def representation_size(self) -> int:
|
||||
return self.__query_encoder.output_representation_size
|
||||
|
||||
def _log_tensorboard_scalar(self, tag: str, value: float, step: int) -> None:
|
||||
"""Log scalar values that are not ops to tensorboard."""
|
||||
summary = tf.Summary(value=[tf.Summary.Value(tag=tag,
|
||||
simple_value=value)])
|
||||
self.__summary_writer.add_summary(summary, step)
|
||||
self.__summary_writer.flush()
|
||||
|
||||
def save(self, path: RichPath) -> None:
|
||||
variables_to_save = list(set(self.__sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
|
||||
weights_to_save = self.__sess.run(variables_to_save)
|
||||
weights_to_save = {var.name: value
|
||||
for (var, value) in zip(variables_to_save, weights_to_save)}
|
||||
|
||||
data_to_save = {
|
||||
"model_type": type(self).__name__,
|
||||
"hyperparameters": self.hyperparameters,
|
||||
"query_metadata": self.__query_metadata,
|
||||
"per_code_language_metadata": self.__per_code_language_metadata,
|
||||
"weights": weights_to_save,
|
||||
"run_name": self.__run_name,
|
||||
}
|
||||
|
||||
path.save_as_compressed_file(data_to_save)
|
||||
|
||||
def train_log(self, msg) -> None:
|
||||
log_path = os.path.join(self.__log_save_dir,
|
||||
f'{self.run_name}.train_log')
|
||||
with open(log_path, mode='a', encoding='utf-8') as f:
|
||||
f.write(msg + "\n")
|
||||
print(msg.encode('ascii', errors='replace').decode())
|
||||
|
||||
def test_log(self, msg) -> None:
|
||||
log_path = os.path.join(self.__log_save_dir,
|
||||
f'{self.run_name}.test_log')
|
||||
with open(log_path, mode='a', encoding='utf-8') as f:
|
||||
f.write(msg + "\n")
|
||||
print(msg.encode('ascii', errors='replace').decode())
|
||||
|
||||
def make_model(self, is_train: bool):
|
||||
with self.__sess.graph.as_default():
|
||||
random.seed(self.hyperparameters['seed'])
|
||||
np.random.seed(self.hyperparameters['seed'])
|
||||
tf.set_random_seed(self.hyperparameters['seed'])
|
||||
|
||||
self._make_model(is_train=is_train)
|
||||
self._make_loss()
|
||||
if is_train:
|
||||
self._make_training_step()
|
||||
self.__summary_writer = tf.summary.FileWriter(self.__tensorboard_dir, self.__sess.graph)
|
||||
|
||||
def _make_model(self, is_train: bool) -> None:
|
||||
"""
|
||||
Create the actual model.
|
||||
|
||||
Note: This has to create self.ops['code_representations'] and self.ops['query_representations'],
|
||||
tensors of the same shape and rank 2.
|
||||
"""
|
||||
self.__placeholders['dropout_keep_rate'] = tf.placeholder(tf.float32,
|
||||
shape=(),
|
||||
name='dropout_keep_rate')
|
||||
self.__placeholders['sample_loss_weights'] = \
|
||||
tf.placeholder_with_default(input=np.ones(shape=[self.hyperparameters['batch_size']],
|
||||
dtype=np.float32),
|
||||
shape=[self.hyperparameters['batch_size']],
|
||||
name='sample_loss_weights')
|
||||
|
||||
with tf.variable_scope("code_encoder"):
|
||||
language_encoders = []
|
||||
for (language, language_metadata) in sorted(self.__per_code_language_metadata.items(), key=lambda kv: kv[0]):
|
||||
with tf.variable_scope(language):
|
||||
self.__code_encoders[language] = self.__code_encoder_type(label="code",
|
||||
hyperparameters=self.hyperparameters,
|
||||
metadata=language_metadata)
|
||||
language_encoders.append(self.__code_encoders[language].make_model(is_train=is_train))
|
||||
self.ops['code_representations'] = tf.concat(language_encoders, axis=0)
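# The per-language encoder outputs are concatenated along the batch axis; __minibatch_to_feed_dict
# assembles the query data in the same per-language order, so row i of the code and query
# representations always refers to the same sample.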
|
||||
with tf.variable_scope("query_encoder"):
|
||||
self.__query_encoder = self.__query_encoder_type(label="query",
|
||||
hyperparameters=self.hyperparameters,
|
||||
metadata=self.__query_metadata)
|
||||
self.ops['query_representations'] = self.__query_encoder.make_model(is_train=is_train)
|
||||
|
||||
code_representation_size = next(iter(self.__code_encoders.values())).output_representation_size
|
||||
query_representation_size = self.__query_encoder.output_representation_size
|
||||
assert code_representation_size == query_representation_size, \
|
||||
f'Representations produced for code ({code_representation_size}) and query ({query_representation_size}) cannot differ!'
|
||||
|
||||
def get_code_token_embeddings(self, language: str) -> Tuple[tf.Tensor, List[str]]:
|
||||
with self.__sess.graph.as_default():
|
||||
with tf.variable_scope("code_encoder"):
|
||||
return self.__code_encoders[language].get_token_embeddings()
|
||||
|
||||
def get_query_token_embeddings(self) -> Tuple[tf.Tensor, List[str]]:
|
||||
with self.__sess.graph.as_default():
|
||||
with tf.variable_scope("query_encoder"):
|
||||
return self.__query_encoder.get_token_embeddings()
|
||||
|
||||
def _make_loss(self) -> None:
|
||||
if self.hyperparameters['loss'] == 'softmax':
|
||||
logits = tf.matmul(self.ops['query_representations'],
|
||||
self.ops['code_representations'],
|
||||
transpose_a=False,
|
||||
transpose_b=True,
|
||||
name='code_query_cooccurrence_logits',
|
||||
) # B x B
|
||||
|
||||
similarity_scores = logits
|
||||
|
||||
per_sample_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
|
||||
labels=tf.range(tf.shape(self.ops['code_representations'])[0]), # [0, 1, 2, 3, ..., n]
|
||||
logits=logits
|
||||
)
|
||||
elif self.hyperparameters['loss'] == 'cosine':
|
||||
query_norms = tf.norm(self.ops['query_representations'], axis=-1, keep_dims=True) + 1e-10
|
||||
code_norms = tf.norm(self.ops['code_representations'], axis=-1, keep_dims=True) + 1e-10
|
||||
cosine_similarities = tf.matmul(self.ops['query_representations'] / query_norms,
|
||||
self.ops['code_representations'] / code_norms,
|
||||
transpose_a=False,
|
||||
transpose_b=True,
|
||||
name='code_query_cooccurrence_logits',
|
||||
) # B x B
|
||||
similarity_scores = cosine_similarities
|
||||
|
||||
# A max-margin-like loss, but do not penalize negative cosine similarities.
|
||||
neg_matrix = tf.diag(tf.fill(dims=[tf.shape(cosine_similarities)[0]], value=float('-inf')))
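# Putting -inf on the diagonal removes the correct (positive) pair from the max over negatives
# below; the relu maps those -inf entries (and any negative similarities) back to 0.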
|
||||
per_sample_loss = tf.maximum(0., self.hyperparameters['margin']
|
||||
- tf.diag_part(cosine_similarities)
|
||||
+ tf.reduce_max(tf.nn.relu(cosine_similarities + neg_matrix),
|
||||
axis=-1))
|
||||
elif self.hyperparameters['loss'] == 'max-margin':
|
||||
logits = tf.matmul(self.ops['query_representations'],
|
||||
self.ops['code_representations'],
|
||||
transpose_a=False,
|
||||
transpose_b=True,
|
||||
name='code_query_cooccurrence_logits',
|
||||
) # B x B
|
||||
similarity_scores = logits
|
||||
logprobs = tf.nn.log_softmax(logits)
|
||||
|
||||
min_inf_matrix = tf.diag(tf.fill(dims=[tf.shape(logprobs)[0]], value=float('-inf')))
|
||||
per_sample_loss = tf.maximum(0., self.hyperparameters['margin']
|
||||
- tf.diag_part(logprobs)
|
||||
+ tf.reduce_max(logprobs + min_inf_matrix, axis=-1))
|
||||
elif self.hyperparameters['loss'] == 'triplet':
|
||||
query_reps = self.ops['query_representations'] # BxD
|
||||
code_reps = self.ops['code_representations'] # BxD
|
||||
|
||||
query_reps = tf.broadcast_to(query_reps, shape=[tf.shape(query_reps)[0], tf.shape(query_reps)[0],tf.shape(query_reps)[1]]) # B*xBxD
|
||||
code_reps = tf.broadcast_to(code_reps, shape=[tf.shape(code_reps)[0], tf.shape(code_reps)[0],tf.shape(code_reps)[1]]) # B*xBxD
|
||||
code_reps = tf.transpose(code_reps, perm=(1, 0, 2)) # BxB*xD
|
||||
|
||||
all_pair_distances = tf.norm(query_reps - code_reps, axis=-1) # BxB
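# all_pair_distances[i, j] is the Euclidean distance between code snippet i and query j,
# so the diagonal holds the distances of the matching (query, code) pairs.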
|
||||
similarity_scores = -all_pair_distances
|
||||
|
||||
correct_distances = tf.expand_dims(tf.diag_part(all_pair_distances), axis=-1) # Bx1
|
||||
|
||||
pointwise_loss = tf.nn.relu(correct_distances - all_pair_distances + self.hyperparameters['margin']) # BxB
|
||||
pointwise_loss *= (1 - tf.eye(tf.shape(pointwise_loss)[0]))
|
||||
|
||||
per_sample_loss = tf.reduce_sum(pointwise_loss, axis=-1) / (tf.reduce_sum(tf.cast(tf.greater(pointwise_loss, 0), dtype=tf.float32), axis=-1) + 1e-10) # B
|
||||
else:
|
||||
raise Exception(f'Unrecognized loss-type "{self.hyperparameters["loss"]}"')
|
||||
|
||||
per_sample_loss = per_sample_loss * self.placeholders['sample_loss_weights']
|
||||
self.ops['loss'] = tf.reduce_sum(per_sample_loss) / tf.reduce_sum(self.placeholders['sample_loss_weights'])
|
||||
|
||||
# extract the logits from the diagonal of the matrix, which are the logits corresponding to the ground-truth
|
||||
correct_scores = tf.diag_part(similarity_scores)
|
||||
# compute how many queries have bigger logits than the ground truth (the diagonal) -> which will be incorrectly ranked
|
||||
compared_scores = similarity_scores >= tf.expand_dims(correct_scores, axis=-1)
|
||||
# for each row of the matrix (query), sum how many logits are larger than the ground truth
|
||||
# ...then take the reciprocal of that to get the MRR for each individual query (you will need to take the mean later)
|
||||
self.ops['mrr'] = 1 / tf.reduce_sum(tf.to_float(compared_scores), axis=1)
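# Illustrative (hypothetical) example of the MRR computation above:
#   similarity_scores = [[0.9, 0.95],      correct_scores = [0.9, 0.7]
#                        [0.1, 0.70]]
#   compared_scores   = [[True, True],     row sums = [2, 1]
#                        [False, True]]
#   per-sample MRR    = [1/2, 1/1] = [0.5, 1.0]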
|
||||
|
||||
def _make_training_step(self) -> None:
|
||||
"""
|
||||
Constructs self.ops['train_step'] from self.ops['loss'] and hyperparameters.
|
||||
"""
|
||||
optimizer_name = self.hyperparameters['optimizer'].lower()
|
||||
if optimizer_name == 'sgd':
|
||||
optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.hyperparameters['learning_rate'])
|
||||
elif optimizer_name == 'rmsprop':
|
||||
optimizer = tf.train.RMSPropOptimizer(learning_rate=self.hyperparameters['learning_rate'],
|
||||
decay=self.hyperparameters['learning_rate_decay'],
|
||||
momentum=self.hyperparameters['momentum'])
|
||||
elif optimizer_name == 'adam':
|
||||
optimizer = tf.train.AdamOptimizer(learning_rate=self.hyperparameters['learning_rate'])
|
||||
else:
|
||||
raise Exception('Unknown optimizer "%s".' % (self.hyperparameters['optimizer']))
|
||||
|
||||
# Calculate and clip gradients
|
||||
trainable_vars = self.sess.graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
|
||||
gradients = tf.gradients(self.ops['loss'], trainable_vars)
|
||||
clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.hyperparameters['gradient_clip'])
|
||||
pruned_clipped_gradients = []
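# Scaling gradients per sub-network below emulates separate learning rates for the code and
# query encoders without instantiating two optimizers; gradients that are None (variables not
# reached by the loss) are dropped before apply_gradients.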
|
||||
for (gradient, trainable_var) in zip(clipped_gradients, trainable_vars):
|
||||
if gradient is None:
|
||||
continue
|
||||
if trainable_var.name.startswith("code_encoder/"):
|
||||
gradient *= tf.constant(self.hyperparameters['learning_rate_code_scale_factor'],
|
||||
dtype=tf.float32)
|
||||
elif trainable_var.name.startswith("query_encoder/"):
|
||||
gradient *= tf.constant(self.hyperparameters['learning_rate_query_scale_factor'],
|
||||
dtype=tf.float32)
|
||||
|
||||
pruned_clipped_gradients.append((gradient, trainable_var))
|
||||
self.ops['train_step'] = optimizer.apply_gradients(pruned_clipped_gradients)
|
||||
|
||||
def load_metadata(self, data_dirs: List[RichPath], max_files_per_dir: Optional[int] = None, parallelize: bool = True) -> None:
|
||||
raw_query_metadata_list = []
|
||||
raw_code_language_metadata_lists: DefaultDict[str, List] = defaultdict(list)
|
||||
|
||||
def metadata_parser_fn(_, file_path: RichPath) -> Iterable[Tuple[Dict[str, Any], Dict[str, Dict[str, Any]]]]:
|
||||
raw_query_metadata = self.__query_encoder_type.init_metadata()
|
||||
per_code_language_metadata: DefaultDict[str, Dict[str, Any]] = defaultdict(self.__code_encoder_type.init_metadata)
|
||||
|
||||
for raw_sample in file_path.read_by_file_suffix():
|
||||
sample_language = raw_sample['language']
|
||||
self.__code_encoder_type.load_metadata_from_sample(raw_sample['code_tokens'],
|
||||
per_code_language_metadata[sample_language],
|
||||
self.hyperparameters['code_use_subtokens'],
|
||||
self.hyperparameters['code_mark_subtoken_end'])
|
||||
self.__query_encoder_type.load_metadata_from_sample([d.lower() for d in raw_sample['docstring_tokens']],
|
||||
raw_query_metadata)
|
||||
yield (raw_query_metadata, per_code_language_metadata)
|
||||
|
||||
def received_result_callback(metadata_parser_result: Tuple[Dict[str, Any], Dict[str, Dict[str, Any]]]):
|
||||
(raw_query_metadata, per_code_language_metadata) = metadata_parser_result
|
||||
raw_query_metadata_list.append(raw_query_metadata)
|
||||
for (metadata_language, raw_code_language_metadata) in per_code_language_metadata.items():
|
||||
raw_code_language_metadata_lists[metadata_language].append(raw_code_language_metadata)
|
||||
|
||||
def finished_callback():
|
||||
pass
|
||||
|
||||
if parallelize:
|
||||
run_jobs_in_parallel(get_data_files_from_directory(data_dirs, max_files_per_dir),
|
||||
metadata_parser_fn,
|
||||
received_result_callback,
|
||||
finished_callback)
|
||||
else:
|
||||
for (idx, file) in enumerate(get_data_files_from_directory(data_dirs, max_files_per_dir)):
|
||||
for res in metadata_parser_fn(idx, file):
|
||||
received_result_callback(res)
|
||||
|
||||
self.__query_metadata = self.__query_encoder_type.finalise_metadata("query", self.hyperparameters, raw_query_metadata_list)
|
||||
for (language, raw_per_language_metadata) in raw_code_language_metadata_lists.items():
|
||||
self.__per_code_language_metadata[language] = \
|
||||
self.__code_encoder_type.finalise_metadata("code", self.hyperparameters, raw_per_language_metadata)
|
||||
|
||||
def load_existing_metadata(self, metadata_path: RichPath):
|
||||
saved_data = metadata_path.read_by_file_suffix()
|
||||
|
||||
hyper_names = set(self.hyperparameters.keys())
|
||||
hyper_names.update(saved_data['hyperparameters'].keys())
|
||||
for hyper_name in hyper_names:
|
||||
old_hyper_value = saved_data['hyperparameters'].get(hyper_name)
|
||||
new_hyper_value = self.hyperparameters.get(hyper_name)
|
||||
if old_hyper_value != new_hyper_value:
|
||||
self.train_log("I: Hyperparameter %s now has value '%s' but was '%s' when tensorising data."
|
||||
% (hyper_name, new_hyper_value, old_hyper_value))
|
||||
|
||||
self.__query_metadata = saved_data['query_metadata']
|
||||
self.__per_code_language_metadata = saved_data['per_code_language_metadata']
|
||||
|
||||
def load_data_from_dirs(self, data_dirs: List[RichPath], is_test: bool,
|
||||
max_files_per_dir: Optional[int] = None,
|
||||
return_num_original_samples: bool = False,
|
||||
parallelize: bool = True) -> Union[LoadedSamples, Tuple[LoadedSamples, int]]:
|
||||
return self.load_data_from_files(data_files=list(get_data_files_from_directory(data_dirs, max_files_per_dir)),
|
||||
is_test=is_test,
|
||||
return_num_original_samples=return_num_original_samples,
|
||||
parallelize=parallelize)
|
||||
|
||||
def load_data_from_files(self, data_files: Iterable[RichPath], is_test: bool,
|
||||
return_num_original_samples: bool = False, parallelize: bool = True) -> Union[LoadedSamples, Tuple[LoadedSamples, int]]:
|
||||
tasks_as_args = [(self.hyperparameters,
|
||||
self.__code_encoder_type,
|
||||
self.__per_code_language_metadata,
|
||||
self.__query_encoder_type,
|
||||
self.__query_metadata,
|
||||
is_test,
|
||||
data_file)
|
||||
for data_file in data_files]
|
||||
|
||||
if parallelize:
|
||||
with multiprocessing.Pool() as pool:
|
||||
per_file_results = pool.starmap(parse_data_file, tasks_as_args)
|
||||
else:
|
||||
per_file_results = [parse_data_file(*task_args) for task_args in tasks_as_args]
|
||||
samples: DefaultDict[str, List] = defaultdict(list)
|
||||
num_all_samples = 0
|
||||
for per_file_result in per_file_results:
|
||||
for (language, parsed_samples) in per_file_result.items():
|
||||
for (use_example, parsed_sample) in parsed_samples:
|
||||
num_all_samples += 1
|
||||
if use_example:
|
||||
samples[language].append(parsed_sample)
|
||||
if return_num_original_samples:
|
||||
return samples, num_all_samples
|
||||
return samples
|
||||
|
||||
def __init_minibatch(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Returns:
|
||||
An empty data holder for minibatch construction.
|
||||
"""
|
||||
batch_data: Dict[str, Any] = dict()
|
||||
batch_data['samples_in_batch'] = 0
|
||||
batch_data['batch_finished'] = False
|
||||
# This part is a bit tricky. To keep the code and query data aligned, we need
|
||||
# to keep everything separate here (including the query data). When we finalise a minibatch,
|
||||
# we will join (concatenate) all the query info and send it to the query encoder. The
|
||||
# language-specific bits get sent to the language-specific encoders, but are ordered such
|
||||
# that concatenating the results of the code encoders gives us something that is aligned
|
||||
# with the query encoder output.
|
||||
batch_data['per_language_query_data'] = {}
|
||||
batch_data['per_language_code_data'] = {}
|
||||
for (language, language_encoder) in self.__code_encoders.items():
|
||||
batch_data['per_language_query_data'][language] = {}
|
||||
batch_data['per_language_query_data'][language]['query_sample_ids'] = []
|
||||
self.__query_encoder.init_minibatch(batch_data['per_language_query_data'][language])
|
||||
batch_data['per_language_code_data'][language] = {}
|
||||
batch_data['per_language_code_data'][language]['code_sample_ids'] = []
|
||||
language_encoder.init_minibatch(batch_data['per_language_code_data'][language])
|
||||
return batch_data
|
||||
|
||||
def __extend_minibatch_by_sample(self,
|
||||
batch_data: Dict[str, Any],
|
||||
sample: Dict[str, Any],
|
||||
language: str,
|
||||
sample_id: SampleId,
|
||||
include_query: bool = True,
|
||||
include_code: bool = True,
|
||||
is_train: bool = False) -> bool:
|
||||
"""
|
||||
Extend a minibatch under construction by one sample.
|
||||
|
||||
Args:
|
||||
batch_data: The minibatch data.
|
||||
sample: The sample to add.
|
||||
language: The (programming) language of the same to add.
|
||||
sample_id: Unique identifier of the example.
|
||||
include_code: Flag indicating if the code data needs to be added.
|
||||
include_query: Flag indicating if the query data needs to be added.
|
||||
is_train: Flag indicating if we are in train mode (which causes data augmentation)
|
||||
|
||||
Returns:
|
||||
True iff the minibatch is full after this sample.
|
||||
"""
|
||||
minibatch_is_full = False
|
||||
|
||||
# Train with some fraction of samples having their query set to the function name instead of the docstring, and
|
||||
# their function name replaced with out-of-vocab in the code:
|
||||
query_type = QueryType.DOCSTRING.value
|
||||
if is_train and sample[f'query_tokens_{QueryType.FUNCTION_NAME.value}'] is not None and \
|
||||
random.uniform(0., 1.) < self.hyperparameters['fraction_using_func_name']:
|
||||
query_type = QueryType.FUNCTION_NAME.value
|
||||
|
||||
if include_query:
|
||||
batch_data['per_language_query_data'][language]['query_sample_ids'].append(sample_id)
|
||||
minibatch_is_full |= self.__query_encoder.extend_minibatch_by_sample(
|
||||
batch_data['per_language_query_data'][language], sample, is_train=is_train, query_type=query_type)
|
||||
if include_code:
|
||||
batch_data['per_language_code_data'][language]['code_sample_ids'].append(sample_id)
|
||||
minibatch_is_full |= self.__code_encoders[language].extend_minibatch_by_sample(
|
||||
batch_data['per_language_code_data'][language], sample, is_train=is_train, query_type=query_type)
|
||||
return minibatch_is_full or batch_data['samples_in_batch'] >= self.hyperparameters['batch_size']
|
||||
|
||||
def __minibatch_to_feed_dict(self,
|
||||
batch_data: Dict[str, Any],
|
||||
language_to_reweighting_factor: Optional[Dict[str, float]],
|
||||
is_train: bool) -> Tuple[Dict[tf.Tensor, Any], List[SampleId]]:
|
||||
"""
|
||||
Take a collected minibatch and turn it into something that can be fed directly to the constructed model
|
||||
|
||||
Args:
|
||||
batch_data: The minibatch data (initialised by __init_minibatch and repeatedly filled by __extend_minibatch_by_sample)
|
||||
language_to_reweighting_factor: Optional map from language to the language-specific weighting factor. If not present,
|
||||
no reweighting will be performed.
|
||||
is_train: Flag indicating if we are in train mode (to set dropout properly)
|
||||
|
||||
Returns:
|
||||
A pair of a map from model placeholders to appropriate data structures and a list of sample ids
|
||||
such that id_list[i] = id means that the i-th minibatch entry corresponds to the sample identified by id.
|
||||
"""
|
||||
final_minibatch = {self.__placeholders['dropout_keep_rate']: self.hyperparameters['dropout_keep_rate'] if is_train else 1.0}
|
||||
|
||||
# Finalise the code representations while joining the query information:
|
||||
full_query_batch_data: Dict[str, Any] = {'code_sample_ids': []}
|
||||
language_weights = []
|
||||
for (language, language_encoder) in self.__code_encoders.items():
|
||||
language_encoder.minibatch_to_feed_dict(batch_data['per_language_code_data'][language], final_minibatch, is_train)
|
||||
full_query_batch_data['code_sample_ids'].extend(batch_data['per_language_code_data'][language]['code_sample_ids'])
|
||||
|
||||
for (key, value) in batch_data['per_language_query_data'][language].items():
|
||||
if key in full_query_batch_data:
|
||||
if isinstance(value, list):
|
||||
full_query_batch_data[key].extend(value)
|
||||
elif isinstance(value, int):
|
||||
full_query_batch_data[key] += value
|
||||
else:
|
||||
raise ValueError()
|
||||
else:
|
||||
full_query_batch_data[key] = value
|
||||
if language_to_reweighting_factor is not None:
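# Note: the per-sample count is taken from the 'tokens' entry that the language encoders
# are assumed to populate in their minibatch data.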
|
||||
language_weights.extend([language_to_reweighting_factor[language]] * len(batch_data['per_language_code_data'][language]['tokens']))
|
||||
|
||||
self.__query_encoder.minibatch_to_feed_dict(full_query_batch_data, final_minibatch, is_train)
|
||||
if language_to_reweighting_factor is not None:
|
||||
final_minibatch[self.__placeholders['sample_loss_weights']] = language_weights
|
||||
if len(full_query_batch_data['query_sample_ids']) > 0: # If we are only computing code representations, this will be empty
|
||||
return final_minibatch, full_query_batch_data['query_sample_ids']
|
||||
else:
|
||||
return final_minibatch, full_query_batch_data['code_sample_ids']
|
||||
|
||||
def __split_data_into_minibatches(self,
|
||||
data: LoadedSamples,
|
||||
is_train: bool = False,
|
||||
include_query: bool = True,
|
||||
include_code: bool = True,
|
||||
drop_incomplete_final_minibatch: bool = True,
|
||||
compute_language_weightings: bool = False) \
|
||||
-> Iterable[Tuple[Dict[tf.Tensor, Any], Any, int, List[SampleId]]]:
|
||||
"""
|
||||
Take tensorised data and chunk into feed dictionaries corresponding to minibatches.
|
||||
|
||||
Args:
|
||||
data: The tensorised input data.
|
||||
is_train: Flag indicating if we are in train mode (which causes shuffling and the use of dropout)
|
||||
include_query: Flag indicating if query data should be included.
|
||||
include_code: Flag indicating if code data should be included.
|
||||
drop_incomplete_final_minibatch: If True, all returned minibatches will have the configured size
|
||||
and some examples from data may not be considered at all. If False, the final minibatch will
|
||||
be shorter than the configured size.
|
||||
compute_language_weightings: If True, produces weights for samples that normalise the loss
|
||||
contribution of each language to be 1/num_languages.
|
||||
|
||||
Returns:
|
||||
Iterable sequence of 4-tuples:
|
||||
(1) A feed dict mapping placeholders to values,
|
||||
(2) Number of samples in the batch
|
||||
(3) Total number of datapoints processed
|
||||
(4) List of IDs that connect the minibatch elements to the inputs. Concretely,
|
||||
element id_list[i] = (lang, j) indicates that the i-th result in the batch
|
||||
corresponds to the sample data[lang][j].
|
||||
"""
|
||||
# We remove entries from language_to_num_remaining_samples once None are remaining:
|
||||
language_to_num_remaining_samples, language_to_idx_list = {}, {}
|
||||
for (language, samples) in data.items():
|
||||
num_samples = len(samples)
|
||||
language_to_num_remaining_samples[language] = num_samples
|
||||
sample_idx_list = np.arange(num_samples)
|
||||
if is_train:
|
||||
np.random.shuffle(sample_idx_list)
|
||||
language_to_idx_list[language] = sample_idx_list
|
||||
|
||||
if compute_language_weightings:
|
||||
# We want to weigh languages equally, and thus normalise the loss of their samples with
|
||||
# total_num_samples * 1/num_languages * 1/num_samples_per_language.
|
||||
# Then, assuming a loss of 1 per sample for simplicity, the total loss attributed to a language is
|
||||
# \sum_{1 \leq i \leq num_samples_per_language} total_num_samples / (num_languages * num_samples_per_language)
|
||||
# = num_samples_per_language * total_num_samples / (num_languages * num_samples_per_language)
|
||||
# = total_num_samples / num_languages
|
||||
total_num_samples = sum(language_to_num_remaining_samples.values())
|
||||
num_languages = len(language_to_num_remaining_samples)
|
||||
language_to_reweighting_factor = {language: float(total_num_samples)/(num_languages * num_samples_per_language)
|
||||
for (language, num_samples_per_language) in language_to_num_remaining_samples.items()}
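# Illustrative (hypothetical) numbers: with 1000 Python and 100 Ruby samples,
# total_num_samples = 1100 and num_languages = 2, so the factors are
# 1100 / (2 * 1000) = 0.55 for Python and 1100 / (2 * 100) = 5.5 for Ruby;
# each language then contributes 550 = 1100 / 2 to the (unit) loss.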
|
||||
else:
|
||||
language_to_reweighting_factor = None # type: ignore
|
||||
|
||||
total_samples_used = 0
|
||||
batch_data = self.__init_minibatch()
|
||||
|
||||
while len(language_to_num_remaining_samples) > 0:
|
||||
# Pick a language for the sample, by weighted sampling over the remaining data points:
|
||||
remaining_languages = list(language_to_num_remaining_samples.keys())
|
||||
total_num_remaining_samples = sum(language_to_num_remaining_samples.values())
|
||||
picked_language = np.random.choice(a=remaining_languages,
|
||||
p=[float(language_to_num_remaining_samples[lang]) / total_num_remaining_samples
|
||||
for lang in remaining_languages])
|
||||
|
||||
# Pick an example for the given language, and update counters:
|
||||
picked_example_idx = language_to_num_remaining_samples[picked_language] - 1 # Note that indexing is 0-based and counting 1-based...
|
||||
language_to_num_remaining_samples[picked_language] -= 1
|
||||
if language_to_num_remaining_samples[picked_language] == 0:
|
||||
del(language_to_num_remaining_samples[picked_language]) # We are done with picked_language now
|
||||
picked_sample = data[picked_language][language_to_idx_list[picked_language][picked_example_idx]]
|
||||
|
||||
# Add the example to the current minibatch under preparation:
|
||||
batch_data['samples_in_batch'] += 1
|
||||
batch_finished = self.__extend_minibatch_by_sample(batch_data,
|
||||
picked_sample,
|
||||
language=picked_language,
|
||||
sample_id=(picked_language, language_to_idx_list[picked_language][picked_example_idx]),
|
||||
include_query=include_query,
|
||||
include_code=include_code,
|
||||
is_train=is_train
|
||||
)
|
||||
total_samples_used += 1
|
||||
|
||||
if batch_finished:
|
||||
feed_dict, original_sample_ids = self.__minibatch_to_feed_dict(batch_data, language_to_reweighting_factor, is_train)
|
||||
yield feed_dict, batch_data['samples_in_batch'], total_samples_used, original_sample_ids
|
||||
batch_data = self.__init_minibatch()
|
||||
|
||||
if not drop_incomplete_final_minibatch and batch_data['samples_in_batch'] > 0:
|
||||
feed_dict, original_sample_ids = self.__minibatch_to_feed_dict(batch_data, language_to_reweighting_factor, is_train)
|
||||
yield feed_dict, batch_data['samples_in_batch'], total_samples_used, original_sample_ids
|
||||
|
||||
def __run_epoch_in_batches(self, data: LoadedSamples, epoch_name: str, is_train: bool, quiet: bool = False) -> Tuple[float, float, float]:
|
||||
"""
|
||||
Args:
|
||||
data: Data to run on; will be broken into minibatches.
|
||||
epoch_name: Name to use in CLI output.
|
||||
is_train: Flag indicating if we should also run training ops (i.e., update weights).
|
||||
quiet: Flag indicating that we should print only a few lines (useful when not running in an interactive shell)
|
||||
|
||||
Returns:
|
||||
Triple of epoch loss (average over samples), MRR (average over samples), and total time used for the epoch (in seconds)
|
||||
"""
|
||||
"""Run the training ops and return the loss and the MRR."""
|
||||
epoch_loss, loss = 0.0, 0.0
|
||||
mrr_sum, mrr = 0.0, 0.0
|
||||
epoch_start = time.time()
|
||||
data_generator = self.__split_data_into_minibatches(data, is_train=is_train, compute_language_weightings=True)
|
||||
samples_used_so_far = 0
|
||||
printed_one_line = False
|
||||
for minibatch_counter, (batch_data_dict, samples_in_batch, samples_used_so_far, _) in enumerate(data_generator):
|
||||
if not quiet or (minibatch_counter % 100) == 99:
|
||||
print("%s: Batch %5i (has %i samples). Processed %i samples. Loss so far: %.4f. MRR so far: %.4f "
|
||||
% (epoch_name, minibatch_counter, samples_in_batch,
|
||||
samples_used_so_far - samples_in_batch, loss, mrr),
|
||||
flush=True,
|
||||
end="\r" if not quiet else '\n')
|
||||
printed_one_line = True
|
||||
ops_to_run = {'loss': self.__ops['loss'], 'mrr': self.__ops['mrr']}
|
||||
if is_train:
|
||||
ops_to_run['train_step'] = self.__ops['train_step']
|
||||
op_results = self.__sess.run(ops_to_run, feed_dict=batch_data_dict)
|
||||
assert not np.isnan(op_results['loss'])
|
||||
|
||||
epoch_loss += op_results['loss'] * samples_in_batch
|
||||
mrr_sum += np.sum(op_results['mrr'])
|
||||
|
||||
loss = epoch_loss / max(1, samples_used_so_far)
|
||||
mrr = mrr_sum / max(1, samples_used_so_far)
|
||||
|
||||
# additional training logs
|
||||
if (minibatch_counter % 100) == 0 and is_train:
|
||||
wandb.log({'train-loss': op_results['loss'],
|
||||
'train-mrr': op_results['mrr']})
|
||||
|
||||
minibatch_counter += 1
|
||||
|
||||
used_time = time.time() - epoch_start
|
||||
if printed_one_line:
|
||||
print("\r\x1b[K", end='')
|
||||
self.train_log(" Epoch %s took %.2fs [processed %s samples/second]"
|
||||
% (epoch_name, used_time, int(samples_used_so_far/used_time)))
|
||||
|
||||
return loss, mrr, used_time
|
||||
|
||||
@property
|
||||
def model_save_path(self) -> str:
|
||||
return os.path.join(self.__model_save_dir,
|
||||
f'{self.run_name}_model_best.pkl.gz')
|
||||
|
||||
def train(self,
|
||||
train_data: LoadedSamples,
|
||||
valid_data: LoadedSamples,
|
||||
azure_info_path: Optional[str],
|
||||
quiet: bool = False,
|
||||
resume: bool = False) -> RichPath:
|
||||
model_path = RichPath.create(self.model_save_path, azure_info_path)
|
||||
with self.__sess.as_default():
|
||||
tf.set_random_seed(self.hyperparameters['seed'])
|
||||
train_data_per_lang_nums = {language: len(samples) for language, samples in train_data.items()}
|
||||
print('Training on %s samples.' % (", ".join("%i %s" % (num, lang) for (lang, num) in train_data_per_lang_nums.items())))
|
||||
valid_data_per_lang_nums = {language: len(samples) for language, samples in valid_data.items()}
|
||||
print('Validating on %s samples.' % (", ".join("%i %s" % (num, lang) for (lang, num) in valid_data_per_lang_nums.items())))
|
||||
|
||||
if resume:
|
||||
# Variables should have been restored.
|
||||
best_val_mrr_loss, best_val_mrr, _ = self.__run_epoch_in_batches(valid_data, "RESUME (valid)", is_train=False, quiet=quiet)
|
||||
self.train_log('Validation Loss on Resume: %.6f' % (best_val_mrr_loss,))
|
||||
else:
|
||||
init_op = tf.variables_initializer(self.__sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))
|
||||
self.__sess.run(init_op)
|
||||
self.save(model_path)
|
||||
best_val_mrr = 0
|
||||
no_improvement_counter = 0
|
||||
epoch_number = 0
|
||||
while (epoch_number < self.hyperparameters['max_epochs']
|
||||
and no_improvement_counter < self.hyperparameters['patience']):
|
||||
|
||||
self.train_log('==== Epoch %i ====' % (epoch_number,))
|
||||
|
||||
# run training loop and log metrics
|
||||
train_loss, train_mrr, train_time = self.__run_epoch_in_batches(train_data, "%i (train)" % (epoch_number,),
|
||||
is_train=True,
|
||||
quiet=quiet)
|
||||
self.train_log(' Training Loss: %.6f' % (train_loss,))
|
||||
|
||||
# run validation calcs and log metrics
|
||||
val_loss, val_mrr, val_time = self.__run_epoch_in_batches(valid_data, "%i (valid)" % (epoch_number,),
|
||||
is_train=False,
|
||||
quiet=quiet)
|
||||
self.train_log(' Validation: Loss: %.6f | MRR: %.6f' % (val_loss, val_mrr,))
|
||||
|
||||
log = {'epoch': epoch_number,
|
||||
'train-loss': train_loss,
|
||||
'train-mrr': train_mrr,
|
||||
'train-time-sec': train_time,
|
||||
'val-loss': val_loss,
|
||||
'val-mrr': val_mrr,
|
||||
'val-time-sec': val_time}
|
||||
|
||||
# log to wandb
|
||||
wandb.log(log)
|
||||
|
||||
# log to tensorboard
|
||||
for key in log:
|
||||
if key != 'epoch':
|
||||
self._log_tensorboard_scalar(tag=key,
|
||||
value=log[key],
|
||||
step=epoch_number)
|
||||
|
||||
# log the final epoch number
|
||||
wandb.run.summary['epoch'] = epoch_number
|
||||
|
||||
if val_mrr > best_val_mrr:
|
||||
# save the best val_mrr encountered
|
||||
best_val_mrr_loss, best_val_mrr = val_loss, val_mrr
|
||||
|
||||
wandb.run.summary['best_val_mrr_loss'] = best_val_mrr_loss
|
||||
wandb.run.summary['best_val_mrr'] = val_mrr
|
||||
wandb.run.summary['best_epoch'] = epoch_number
|
||||
|
||||
no_improvement_counter = 0
|
||||
self.save(model_path)
|
||||
self.train_log(" Best result so far -- saved model as '%s'." % (model_path,))
|
||||
else:
|
||||
# record epochs without improvement for early stopping
|
||||
no_improvement_counter += 1
|
||||
epoch_number += 1
|
||||
|
||||
log_path = os.path.join(self.__log_save_dir,
|
||||
f'{self.run_name}.train_log')
|
||||
wandb.save(log_path)
|
||||
tf.io.write_graph(self.__sess.graph,
|
||||
logdir=wandb.run.dir,
|
||||
name=f'{self.run_name}-graph.pbtxt')
|
||||
|
||||
self.__summary_writer.close()
|
||||
return model_path
|
||||
|
||||
def __compute_representations_batched(self,
|
||||
raw_data: List[Dict[str, Any]],
|
||||
data_loader_fn: Callable[[Dict[str, Any], Dict[str, Any]], bool],
|
||||
model_representation_op: tf.Tensor,
|
||||
representation_type: RepresentationType) -> List[Optional[np.ndarray]]:
|
||||
"""Return a list of vector representation of each datapoint or None if the representation for that datapoint
|
||||
cannot be computed.
|
||||
|
||||
Args:
|
||||
raw_data: a list of raw data points as dictionaries.
|
||||
data_loader_fn: A function f(in, out) that attempts to load/preprocess the necessary data from
|
||||
in and store it in out, returning a boolean success value. If it returns False, the sample is
|
||||
skipped and no representation is computed.
|
||||
model_representation_op: An op in the computation graph that represents the desired
|
||||
representations.
|
||||
representation_type: type of the representation we are interested in (either code or query)
|
||||
|
||||
Returns:
|
||||
A list whose i-th element is either a 1D numpy array with the representation of the i-th element of raw_data, or None if a
|
||||
representation could not be computed.
|
||||
"""
|
||||
tensorized_data = defaultdict(list) # type: Dict[str, List[Dict[str, Any]]]
|
||||
sample_to_tensorised_data_id = [] # type: List[Optional[SampleId]]
|
||||
for raw_sample in raw_data:
|
||||
language = raw_sample['language']
|
||||
if language.startswith('python'):
|
||||
language = 'python'
|
||||
sample: Dict = {}
|
||||
valid_example = data_loader_fn(raw_sample, sample)
|
||||
if valid_example:
|
||||
sample_to_tensorised_data_id.append((language, len(tensorized_data[language])))
|
||||
tensorized_data[language].append(sample)
|
||||
else:
|
||||
sample_to_tensorised_data_id.append(None)
|
||||
assert len(sample_to_tensorised_data_id) == len(raw_data)
|
||||
|
||||
data_generator = self.__split_data_into_minibatches(tensorized_data,
|
||||
is_train=False,
|
||||
include_query=representation_type == RepresentationType.QUERY,
|
||||
include_code=representation_type == RepresentationType.CODE,
|
||||
drop_incomplete_final_minibatch=False)
|
||||
|
||||
computed_representations = []
|
||||
original_tensorised_data_ids = [] # type: List[SampleId]
|
||||
for minibatch_counter, (batch_data_dict, samples_in_batch, samples_used_so_far, batch_original_tensorised_data_ids) in enumerate(data_generator):
|
||||
op_results = self.__sess.run(model_representation_op, feed_dict=batch_data_dict)
|
||||
computed_representations.append(op_results)
|
||||
original_tensorised_data_ids.extend(batch_original_tensorised_data_ids)
|
||||
|
||||
computed_representations = np.concatenate(computed_representations, axis=0)
|
||||
tensorised_data_id_to_representation_idx = {tensorised_data_id: repr_idx
|
||||
for (repr_idx, tensorised_data_id) in enumerate(original_tensorised_data_ids)}
|
||||
reordered_representations: List = []
|
||||
for tensorised_data_id in sample_to_tensorised_data_id:
|
||||
if tensorised_data_id is None:
|
||||
reordered_representations.append(None)
|
||||
else:
|
||||
reordered_representations.append(computed_representations[tensorised_data_id_to_representation_idx[tensorised_data_id]])
|
||||
return reordered_representations
|
||||
|
||||
def get_query_representations(self, query_data: List[Dict[str, Any]]) -> List[Optional[np.ndarray]]:
|
||||
def query_data_loader(sample_to_parse, result_holder):
|
||||
function_name = sample_to_parse.get('func_name')
|
||||
return self.__query_encoder_type.load_data_from_sample(
|
||||
"query",
|
||||
self.hyperparameters,
|
||||
self.__query_metadata,
|
||||
[d.lower() for d in sample_to_parse['docstring_tokens']],
|
||||
function_name,
|
||||
result_holder=result_holder,
|
||||
is_test=True)
|
||||
|
||||
return self.__compute_representations_batched(query_data,
|
||||
data_loader_fn=query_data_loader,
|
||||
model_representation_op=self.__ops['query_representations'],
|
||||
representation_type=RepresentationType.QUERY)
|
||||
|
||||
def get_code_representations(self, code_data: List[Dict[str, Any]]) -> List[Optional[np.ndarray]]:
|
||||
def code_data_loader(sample_to_parse, result_holder):
|
||||
code_tokens = sample_to_parse['code_tokens']
|
||||
language = sample_to_parse['language']
|
||||
if language.startswith('python'):
|
||||
language = 'python'
|
||||
|
||||
if code_tokens is not None:
|
||||
function_name = sample_to_parse.get('func_name')
|
||||
return self.__code_encoder_type.load_data_from_sample(
|
||||
"code",
|
||||
self.hyperparameters,
|
||||
self.__per_code_language_metadata[language],
|
||||
code_tokens,
|
||||
function_name,
|
||||
result_holder=result_holder,
|
||||
is_test=True)
|
||||
else:
|
||||
return False
|
||||
|
||||
return self.__compute_representations_batched(code_data,
|
||||
data_loader_fn=code_data_loader,
|
||||
model_representation_op=self.__ops['code_representations'],
|
||||
representation_type=RepresentationType.CODE)
|
|
@ -0,0 +1,35 @@
|
|||
from typing import Any, Dict, Optional
|
||||
|
||||
from encoders import NBoWEncoder
|
||||
from .model import Model
|
||||
|
||||
|
||||
class NeuralBoWModel(Model):
|
||||
@classmethod
|
||||
def get_default_hyperparameters(cls) -> Dict[str, Any]:
|
||||
hypers = {}
|
||||
for label in ["code", "query"]:
|
||||
hypers.update({f'{label}_{key}': value
|
||||
for key, value in NBoWEncoder.get_default_hyperparameters().items()})
|
||||
model_hypers = {
|
||||
'code_use_subtokens': False,
|
||||
'code_mark_subtoken_end': False,
|
||||
'loss': 'cosine',
|
||||
'batch_size': 1000
|
||||
}
|
||||
hypers.update(super().get_default_hyperparameters())
|
||||
hypers.update(model_hypers)
|
||||
return hypers
|
||||
|
||||
def __init__(self,
|
||||
hyperparameters: Dict[str, Any],
|
||||
run_name: Optional[str] = None,
|
||||
model_save_dir: Optional[str] = None,
|
||||
log_save_dir: Optional[str] = None):
|
||||
super().__init__(
|
||||
hyperparameters,
|
||||
code_encoder_type=NBoWEncoder,
|
||||
query_encoder_type=NBoWEncoder,
|
||||
run_name=run_name,
|
||||
model_save_dir=model_save_dir,
|
||||
log_save_dir=log_save_dir)
|
|
@ -0,0 +1,34 @@
|
|||
from typing import Any, Dict, Optional
|
||||
|
||||
from encoders import RNNEncoder
|
||||
from models import Model
|
||||
|
||||
|
||||
class RNNModel(Model):
|
||||
@classmethod
|
||||
def get_default_hyperparameters(cls) -> Dict[str, Any]:
|
||||
hypers = {}
|
||||
for label in ["code", "query"]:
|
||||
hypers.update({f'{label}_{key}': value
|
||||
for key, value in RNNEncoder.get_default_hyperparameters().items()})
|
||||
model_hypers = {
|
||||
'code_use_subtokens': False,
|
||||
'code_mark_subtoken_end': True,
|
||||
'batch_size': 1000
|
||||
}
|
||||
hypers.update(super().get_default_hyperparameters())
|
||||
hypers.update(model_hypers)
|
||||
return hypers
|
||||
|
||||
def __init__(self,
|
||||
hyperparameters: Dict[str, Any],
|
||||
run_name: Optional[str] = None,
|
||||
model_save_dir: Optional[str] = None,
|
||||
log_save_dir: Optional[str] = None):
|
||||
super().__init__(
|
||||
hyperparameters,
|
||||
code_encoder_type=RNNEncoder,
|
||||
query_encoder_type=RNNEncoder,
|
||||
run_name=run_name,
|
||||
model_save_dir=model_save_dir,
|
||||
log_save_dir=log_save_dir)
|
|
@ -0,0 +1,35 @@
|
|||
from typing import Any, Dict, Optional
|
||||
|
||||
from encoders import SelfAttentionEncoder
|
||||
from models import Model
|
||||
|
||||
|
||||
class SelfAttentionModel(Model):
|
||||
@classmethod
|
||||
def get_default_hyperparameters(cls) -> Dict[str, Any]:
|
||||
hypers = {}
|
||||
for label in ["code", "query"]:
|
||||
hypers.update({f'{label}_{key}': value
|
||||
for key, value in SelfAttentionEncoder.get_default_hyperparameters().items()})
|
||||
model_hypers = {
|
||||
'learning_rate': 5e-4,
|
||||
'code_use_subtokens': False,
|
||||
'code_mark_subtoken_end': False,
|
||||
'batch_size': 450,
|
||||
}
|
||||
hypers.update(super().get_default_hyperparameters())
|
||||
hypers.update(model_hypers)
|
||||
return hypers
|
||||
|
||||
def __init__(self,
|
||||
hyperparameters: Dict[str, Any],
|
||||
run_name: Optional[str] = None,
|
||||
model_save_dir: Optional[str] = None,
|
||||
log_save_dir: Optional[str] = None):
|
||||
super().__init__(
|
||||
hyperparameters,
|
||||
code_encoder_type=SelfAttentionEncoder,
|
||||
query_encoder_type=SelfAttentionEncoder,
|
||||
run_name=run_name,
|
||||
model_save_dir=model_save_dir,
|
||||
log_save_dir=log_save_dir)
|
|
@ -0,0 +1,130 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Run predictions on a CodeSearchNet model.
|
||||
|
||||
Usage:
|
||||
predict.py -m MODEL_FILE [-p PREDICTIONS_CSV]
|
||||
predict.py -r RUN_ID [-p PREDICTIONS_CSV]
|
||||
predict.py -h | --help
|
||||
|
||||
Options:
|
||||
-h --help Show this screen
|
||||
-m, --model_file FILENAME Local path to a saved model file (filename.pkl.gz)
|
||||
-r, --wandb_run_id RUN_ID wandb run ID, [username]/codesearchnet/[hash string id], viewable from run overview page via info icon
|
||||
-p, --predictions_csv FILENAME CSV filename for model predictions (note: W&B benchmark submission requires the default name)
|
||||
[default: ../resources/model_predictions.csv]
|
||||
|
||||
Examples:
|
||||
./predict.py -r username/codesearchnet/0123456
|
||||
./predict.py -m ../resources/saved_models/neuralbowmodel-2019-10-31-12-00-00_model_best.pkl.gz
|
||||
"""
|
||||
|
||||
"""
|
||||
This script tests a model on the CodeSearchNet Challenge, given
|
||||
- a particular model as a local file (-m, --model_file MODEL_FILENAME.pkl.gz), OR
|
||||
- as a Weights & Biases run id (-r, --wandb_run_id [username]/codesearchnet/0123456), which you can find
|
||||
on the /overview page or by clicking the 'info' icon on a given run.
|
||||
Run with "-h" to see full command line options.
|
||||
Note that it takes around 2 hours to make predictions with the baseline model.
|
||||
|
||||
This script generates ranking results over the CodeSearchNet corpus for a given model by scoring their relevance
|
||||
(using that model) to 99 search queries of the CodeSearchNet Challenge. We use cosine distance between the learned
|
||||
representations of the natural language queries and the code, which is stored in jsonlines files with this format:
|
||||
https://github.com/ml-msr-github/CodeSearchNet#preprocessed-data-format. The 99 challenge queries are located in
|
||||
this file: https://github.com/ml-msr-github/CodeSearchNet/blob/master/resources/queries.csv.
|
||||
To download the full CodeSearchNet corpus, see the README at the root of this repository.
|
||||
|
||||
Note that this script is specific to methods and code in our baseline model and may not generalize to new models.
|
||||
We provide it as a reference and in order to be transparent about our baseline submission to the CodeSearchNet Challenge.
|
||||
|
||||
This script produces a CSV file of model predictions with the following fields: 'query', 'language', 'identifier', and 'url':
|
||||
* language: the programming language for the given query, e.g. "python". This information is available as a field in the data to be scored.
|
||||
* query: the textual representation of the query, e.g. "int to string" .
|
||||
* identifier: this is an optional field that can help you track your data
|
||||
* url: the unique GitHub URL to the returned results, e.g. "https://github.com/JamesClonk/vultr/blob/fed59ad207c9bda0a5dfe4d18de53ccbb3d80c91/cmd/commands.go#L12-L190". This information is available as a field in the data to be scored.
|
||||
|
||||
The schema of the output CSV file constitutes a valid submission to the CodeSearchNet Challenge hosted on Weights & Biases. See further background and instructions on the submission process in the root README.
|
||||
|
||||
The row order corresponds to the result ranking in the search task. For example, if in row 5 there is an entry for the Python query "read properties file", and in row 60 another result for the Python query "read properties file", then the URL in row 5 is considered to be ranked higher than the URL in row 60 for that query and language.
|
||||
"""
|
||||
|
||||
import pickle
|
||||
import re
|
||||
import sys
|
||||
|
||||
from annoy import AnnoyIndex
|
||||
from docopt import docopt
|
||||
from dpu_utils.utils import RichPath
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
import wandb
|
||||
|
||||
from dataextraction.python.parse_python_data import tokenize_docstring_from_string
|
||||
import model_restore_helper
|
||||
|
||||
def query_model(query, model, indices, language, topk=100):
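# Embed the natural-language query with the trained model and retrieve the indices and
# distances of its top-k approximate nearest neighbours from the pre-built Annoy index
# of code representations.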
|
||||
query_embedding = model.get_query_representations([{'docstring_tokens': tokenize_docstring_from_string(query),
|
||||
'language': language}])[0]
|
||||
idxs, distances = indices.get_nns_by_vector(query_embedding, topk, include_distances=True)
|
||||
return idxs, distances
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
|
||||
queries = pd.read_csv('../resources/queries.csv')
|
||||
queries = list(queries['query'].values)
|
||||
|
||||
run_id = None
|
||||
args_wandb_run_id = args.get('--wandb_run_id')
|
||||
local_model_path = args.get('--model_file')
|
||||
predictions_csv = args.get('--predictions_csv')
|
||||
|
||||
if args_wandb_run_id:
|
||||
# validate format of runid:
|
||||
if len(args_wandb_run_id.split('/')) != 3:
|
||||
print("ERROR: Invalid wandb_run_id format: %s (Expecting: user/project/hash)" % args_wandb_run_id, file=sys.stderr)
|
||||
sys.exit(1)
|
||||
wandb_api = wandb.Api()
|
||||
# retrieve saved model from W&B for this run
|
||||
try:
|
||||
run = wandb_api.run(args_wandb_run_id)
|
||||
except wandb.CommError:
|
||||
print("ERROR: Problem querying W&B for wandb_run_id: %s" % args_wandb_run_id, file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
model_file = [f for f in run.files() if f.name.endswith('gz')][0].download(replace=True)
|
||||
local_model_path = model_file.name
|
||||
run_id = args_wandb_run_id.split('/')[-1]
|
||||
|
||||
model_path = RichPath.create(local_model_path, None)
|
||||
print("Restoring model from %s" % model_path)
|
||||
model = model_restore_helper.restore(
|
||||
path=model_path,
|
||||
is_train=False,
|
||||
hyper_overrides={})
|
||||
|
||||
predictions = []
|
||||
for language in ('python', 'go', 'javascript', 'java', 'php', 'ruby'):
|
||||
print("Evaluating language: %s" % language)
|
||||
definitions = pickle.load(open('../resources/data/{}_dedupe_definitions_v2.pkl'.format(language), 'rb'))
|
||||
indexes = [{'code_tokens': d['function_tokens'], 'language': d['language']} for d in tqdm(definitions)]
|
||||
code_representations = model.get_code_representations(indexes)
|
||||
|
||||
indices = AnnoyIndex(code_representations[0].shape[0])
|
||||
for index, vector in tqdm(enumerate(code_representations)):
|
||||
if vector is not None:
|
||||
indices.add_item(index, vector)
|
||||
indices.build(10)
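# Build 10 random projection trees; more trees generally improve approximate-neighbour
# recall at the cost of a slower build and a larger index.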
|
||||
|
||||
for query in queries:
|
||||
for idx, _ in zip(*query_model(query, model, indices, language)):
|
||||
predictions.append((query, language, definitions[idx]['identifier'], definitions[idx]['url']))
|
||||
|
||||
df = pd.DataFrame(predictions, columns=['query', 'language', 'identifier', 'url'])
|
||||
df.to_csv(predictions_csv, index=False)
|
||||
|
||||
if run_id:
|
||||
# upload model predictions CSV file to W&B
|
||||
wandb.init(id=run_id, resume="must")
|
||||
wandb.save(predictions_csv)
|
|
@ -0,0 +1,129 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Usage:
|
||||
relevanceeval.py [options] RELEVANCE_ANNOTATIONS_CSV_PATH MODEL_PREDICTIONS_CSV
|
||||
|
||||
Standalone relevance evaluation script that outputs evaluation statistics for a set of predictions of a given model.
|
||||
The input format of the files is described below.
|
||||
|
||||
The model predictions MODEL_PREDICTIONS_CSV file has the following format:
|
||||
A comma-separated file with (at least) the fields and headers "language", "query", "url". Each row represents
|
||||
a single result for a given query and a given programming language.
|
||||
|
||||
* language: the programming language for the given query, e.g. "python"
|
||||
* query: the textual representation of the query, e.g. "int to string"
|
||||
* url: the unique GitHub URL to the returned results, e.g. "https://github.com/JamesClonk/vultr/blob/fed59ad207c9bda0a5dfe4d18de53ccbb3d80c91/cmd/commands.go#L12-L190"
|
||||
|
||||
The order of the rows implies the ranking of the results in the search task. For example, if in row 5 there is
|
||||
an entry for the Python query "read properties file" and then in row 60 appears another result for the
|
||||
Python query "read properties file", then the URL in row 5 is considered to be ranked higher than the
|
||||
URL in row 60 for that query and language.
|
||||
|
||||
Options:
|
||||
--debug Run in debug mode, falling into pdb on exceptions.
|
||||
-h --help Show this screen.
|
||||
"""
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from docopt import docopt
|
||||
from dpu_utils.utils import run_and_debug
|
||||
|
||||
def load_relevances(filepath: str) -> Dict[str, Dict[str, Dict[str, float]]]:
|
||||
relevance_annotations = pd.read_csv(filepath)
|
||||
per_query_language = relevance_annotations.pivot_table(
|
||||
index=['Query', 'Language', 'GitHubUrl'], values='Relevance', aggfunc=np.mean)
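# The pivot averages the 'Relevance' scores, so a (Query, Language, GitHubUrl) triple that was
# annotated more than once ends up with its mean relevance.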
|
||||
|
||||
# Map language -> query -> url -> float
|
||||
relevances = defaultdict(lambda: defaultdict(dict)) # type: Dict[str, Dict[str, Dict[str, float]]]
|
||||
for (query, language, url), relevance in per_query_language['Relevance'].items():
|
||||
relevances[language.lower()][query.lower()][url] = relevance
|
||||
return relevances
|
||||
|
||||
def load_predictions(filepath: str, max_urls_per_language: int = 300) -> Dict[str, Dict[str, List[str]]]:
|
||||
prediction_data = pd.read_csv(filepath)
|
||||
|
||||
# Map language -> query -> Ranked List of URL
|
||||
predictions = defaultdict(lambda: defaultdict(list))
|
||||
for _, row in prediction_data.iterrows():
|
||||
predictions[row['language'].lower()][row['query'].lower()].append(row['url'])
|
||||
for query_data in predictions.values():
|
||||
for query, ranked_urls in query_data.items():
|
||||
query_data[query] = ranked_urls[:max_urls_per_language]
|
||||
|
||||
return predictions
|
||||
|
||||
def coverage_per_language(predictions: Dict[str, List[str]],
|
||||
relevance_scores: Dict[str, Dict[str, float]], with_positive_relevance: bool=False) -> float:
|
||||
"""
|
||||
Compute the % of annotated URLs that appear in the algorithm's predictions.
|
||||
"""
|
||||
num_annotations = 0
|
||||
num_covered = 0
|
||||
for query, url_data in relevance_scores.items():
|
||||
urls_in_predictions = set(predictions[query])
|
||||
for url, relevance in url_data.items():
|
||||
if not with_positive_relevance or relevance > 0:
|
||||
num_annotations += 1
|
||||
if url in urls_in_predictions:
|
||||
num_covered += 1
|
||||
|
||||
return num_covered / num_annotations
|
||||
|
||||
def ndcg(predictions: Dict[str, List[str]], relevance_scores: Dict[str, Dict[str, float]],
|
||||
ignore_rank_of_non_annotated_urls: bool=True) -> float:
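# Per query: DCG  = sum_i (2**relevance_i - 1) / log2(i + 1) over the ranked predictions,
#            IDCG = the same sum over the annotated relevances sorted in decreasing order.
# The returned value is the mean of DCG / IDCG over all queries with a positive IDCG.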
|
||||
num_results = 0
|
||||
ndcg_sum = 0
|
||||
|
||||
for query, query_relevance_annotations in relevance_scores.items():
|
||||
current_rank = 1
|
||||
query_dcg = 0
|
||||
for url in predictions[query]:
|
||||
if url in query_relevance_annotations:
|
||||
query_dcg += (2**query_relevance_annotations[url] - 1) / np.log2(current_rank + 1)
|
||||
current_rank += 1
|
||||
elif not ignore_rank_of_non_annotated_urls:
|
||||
current_rank += 1
|
||||
|
||||
query_idcg = 0
|
||||
for i, ideal_relevance in enumerate(sorted(query_relevance_annotations.values(), reverse=True), start=1):
|
||||
query_idcg += (2 ** ideal_relevance - 1) / np.log2(i + 1)
|
||||
if query_idcg == 0:
|
||||
# We have no positive annotations for the given query, so we should probably not penalize anyone about this.
|
||||
continue
|
||||
num_results += 1
|
||||
ndcg_sum += query_dcg / query_idcg
|
||||
return ndcg_sum / num_results
|
||||
|
||||
|
||||
|
||||
def run(arguments):
|
||||
relevance_scores = load_relevances(arguments['RELEVANCE_ANNOTATIONS_CSV_PATH'])
|
||||
predictions = load_predictions(arguments['MODEL_PREDICTIONS_CSV'])
|
||||
|
||||
languages_predicted = sorted(set(predictions.keys()))
|
||||
|
||||
# Now Compute the various evaluation results
|
||||
print('% of annotated URLs that appear in the predictions:')
|
||||
for language in languages_predicted:
|
||||
print(f'\t{language}: {coverage_per_language(predictions[language], relevance_scores[language])*100:.2f}%')
|
||||
|
||||
print('% of annotated URLs (avg relevance > 0) that appear in the predictions:')
|
||||
for language in languages_predicted:
|
||||
print(f'\t{language}: {coverage_per_language(predictions[language], relevance_scores[language], with_positive_relevance=True) * 100:.2f}%')
|
||||
|
||||
print('NDCG:')
|
||||
for language in languages_predicted:
|
||||
print(f'\t{language}: {ndcg(predictions[language], relevance_scores[language]):.3f}')
|
||||
|
||||
print('NDCG (full ranking):')
|
||||
for language in languages_predicted:
|
||||
print(f'\t{language}: {ndcg(predictions[language], relevance_scores[language], ignore_rank_of_non_annotated_urls=False):.3f}')
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
run_and_debug(lambda: run(args), args['--debug'])
|
|
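# A minimal sanity-check sketch for the metrics above, for a single language. It assumes this
# script is importable as `relevanceeval` (the filename is a guess); the query, URLs and
# relevance scores below are made up for the example.
from relevanceeval import coverage_per_language, ndcg

toy_relevances = {'how to sort a list': {'https://github.com/org/repo/a.py': 3.0,
                                         'https://github.com/org/repo/b.py': 0.0}}
toy_predictions = {'how to sort a list': ['https://github.com/org/repo/a.py',
                                          'https://github.com/org/repo/c.py']}

# One of the two annotated URLs is retrieved -> coverage 0.5; the only relevant URL is ranked
# first among annotated results -> NDCG 1.0 when unannotated URLs are ignored.
print(coverage_per_language(toy_predictions, toy_relevances))  # 0.5
print(ndcg(toy_predictions, toy_relevances))                   # 1.0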
@ -0,0 +1,11 @@
|
|||
docopt
|
||||
dpu_utils
|
||||
more-itertools
|
||||
numpy
|
||||
scipy
|
||||
SetSimilaritySearch
|
||||
wandb
|
||||
pygments
|
||||
parso
|
||||
pandas
|
||||
toolz
|
|
@ -0,0 +1,45 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Usage:
|
||||
test.py [options] MODEL_PATH VALID_DATA_PATH TEST_DATA_PATH
|
||||
test.py [options] MODEL_PATH
|
||||
|
||||
Standalone testing script
|
||||
|
||||
Options:
|
||||
-h --help Show this screen.
|
||||
--test-batch-size SIZE The size of the batches in which to compute MRR. [default: 1000]
|
||||
--distance-metric METRIC The distance metric to use [default: cosine]
|
||||
--run-name NAME Picks a name for the trained model.
|
||||
--quiet Less output (not one per line per minibatch). [default: False]
|
||||
--dryrun Do not log run into logging database. [default: False]
|
||||
--azure-info PATH Azure authentication information file (JSON). Used to load data from Azure storage.
|
||||
--sequential Do not parallelise data-loading. Simplifies debugging. [default: False]
|
||||
--debug Enable debug routines. [default: False]
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
from docopt import docopt
|
||||
from dpu_utils.utils import run_and_debug, RichPath
|
||||
|
||||
import model_test as test
|
||||
|
||||
|
||||
def run(arguments):
|
||||
azure_info_path = arguments.get('--azure-info', None)
|
||||
|
||||
# If no valid/test data arguments are passed, default to the files checked into the repo.
|
||||
if not arguments['VALID_DATA_PATH']:
|
||||
dir_path = Path(__file__).parent.absolute()
|
||||
print(dir_path)
|
||||
arguments['VALID_DATA_PATH'] = str(dir_path / 'data_dirs_valid.txt')
|
||||
arguments['TEST_DATA_PATH'] = str(dir_path / 'data_dirs_test.txt')
|
||||
|
||||
valid_data_dirs = test.expand_data_path(arguments['VALID_DATA_PATH'], azure_info_path)
|
||||
test_data_dirs = test.expand_data_path(arguments['TEST_DATA_PATH'], azure_info_path)
|
||||
test.compute_evaluation_metrics(RichPath.create(arguments['MODEL_PATH'], azure_info_path=azure_info_path),
|
||||
arguments, azure_info_path, valid_data_dirs, test_data_dirs)
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
run_and_debug(lambda: run(args), args['--debug'])
|
|
@ -0,0 +1,191 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Usage:
|
||||
train.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH
|
||||
train.py [options] [SAVE_FOLDER]
|
||||
|
||||
*_DATA_PATH arguments accept either (1) a directory filled with .jsonl.gz files that we use as data,
|
||||
or (2) a plain text file containing a list of such directories (used for multi-language training).
|
||||
|
||||
If you supply a (2) plain text file, each directory name must be on its own line.
|
||||
For example, if you want to read from multiple directories you might have a plain text file called
|
||||
data_dirs_train.txt with the below contents:
|
||||
|
||||
> cat ~/src/data_dirs_train.txt
|
||||
azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train
|
||||
azure://semanticcodesearch/csharpdata/split/csharpCrawl-train
|
||||
|
||||
Options:
|
||||
-h --help Show this screen.
|
||||
--max-num-epochs EPOCHS The maximum number of epochs to run [default: 300]
|
||||
--max-files-per-dir INT Maximum number of files per directory to load for training data.
|
||||
--hypers-override HYPERS JSON dictionary overriding hyperparameter values.
|
||||
--hypers-override-file FILE JSON file overriding hyperparameter values.
|
||||
--model MODELNAME Choose model type. [default: neuralbowmodel]
|
||||
--test-batch-size SIZE The size of the batches in which to compute MRR. [default: 1000]
|
||||
--distance-metric METRIC The distance metric to use [default: cosine]
|
||||
--run-name NAME Picks a name for the trained model.
|
||||
--quiet Less output (not one per line per minibatch). [default: False]
|
||||
--dryrun Do not log run into logging database. [default: False]
|
||||
--testrun Do a model run on a small dataset for testing purposes. [default: False]
|
||||
--azure-info PATH Azure authentication information file (JSON). Used to load data from Azure storage.
|
||||
--evaluate-model PATH Run evaluation on previously trained model.
|
||||
--sequential Do not parallelise data-loading. Simplifies debugging. [default: False]
|
||||
--debug Enable debug routines. [default: False]
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import Type, Dict, Any, Optional, List
|
||||
from pathlib import Path
|
||||
|
||||
from docopt import docopt
|
||||
from dpu_utils.utils import RichPath, git_tag_run, run_and_debug
|
||||
import wandb
|
||||
|
||||
import model_restore_helper
|
||||
from model_test import compute_evaluation_metrics
|
||||
from models.model import Model
|
||||
import model_test as test
|
||||
|
||||
def run_train(model_class: Type[Model],
|
||||
train_data_dirs: List[RichPath],
|
||||
valid_data_dirs: List[RichPath],
|
||||
save_folder: str,
|
||||
hyperparameters: Dict[str, Any],
|
||||
azure_info_path: Optional[str],
|
||||
run_name: str,
|
||||
quiet: bool = False,
|
||||
max_files_per_dir: Optional[int] = None,
|
||||
parallelize: bool = True) -> RichPath:
|
||||
model = model_class(hyperparameters, run_name=run_name, model_save_dir=save_folder, log_save_dir=save_folder)
|
||||
if os.path.exists(model.model_save_path):
|
||||
model = model_restore_helper.restore(RichPath.create(model.model_save_path), is_train=True)
|
||||
model.train_log("Resuming training run %s of model %s with following hypers:\n%s" % (run_name,
|
||||
model.__class__.__name__,
|
||||
str(hyperparameters)))
|
||||
resume = True
|
||||
else:
|
||||
model.train_log("Tokenizing and building vocabulary for code snippets and queries. This step may take several hours.")
|
||||
model.load_metadata(train_data_dirs, max_files_per_dir=max_files_per_dir, parallelize=parallelize)
|
||||
model.make_model(is_train=True)
|
||||
model.train_log("Starting training run %s of model %s with following hypers:\n%s" % (run_name,
|
||||
model.__class__.__name__,
|
||||
str(hyperparameters)))
|
||||
resume = False
|
||||
|
||||
philly_job_id = os.environ.get('PHILLY_JOB_ID')
|
||||
if philly_job_id is not None:
|
||||
# We are running on Philly; write out the model name to an auxiliary file
|
||||
with open(os.path.join(save_folder, philly_job_id+'.job'), 'w') as f:
|
||||
f.write(os.path.basename(model.model_save_path))
|
||||
|
||||
wandb.config.update(model.hyperparameters)
|
||||
model.train_log("Loading training and validation data.")
|
||||
train_data = model.load_data_from_dirs(train_data_dirs, is_test=False, max_files_per_dir=max_files_per_dir, parallelize=parallelize)
|
||||
valid_data = model.load_data_from_dirs(valid_data_dirs, is_test=False, max_files_per_dir=max_files_per_dir, parallelize=parallelize)
|
||||
model.train_log("Begin Training.")
|
||||
model_path = model.train(train_data, valid_data, azure_info_path, quiet=quiet, resume=resume)
|
||||
return model_path
|
||||
|
||||
|
||||
def make_run_id(arguments: Dict[str, Any]) -> str:
|
||||
"""Choose a run ID, based on the --save-name parameter, the PHILLY_JOB_ID and the current time."""
|
||||
philly_id = os.environ.get('PHILLY_JOB_ID')
|
||||
if philly_id is not None:
|
||||
return philly_id
|
||||
user_save_name = arguments.get('--run-name')
|
||||
if user_save_name is not None:
|
||||
user_save_name = user_save_name[:-len('.pkl')] if user_save_name.endswith('.pkl') else user_save_name
|
||||
else:
|
||||
user_save_name = arguments['--model']
|
||||
return "%s-%s" % (user_save_name, time.strftime("%Y-%m-%d-%H-%M-%S"))
|
||||
|
||||
|
||||
def run(arguments, tag_in_vcs=False) -> None:
|
||||
azure_info_path = arguments.get('--azure-info', None)
|
||||
testrun = arguments.get('--testrun')
|
||||
max_files_per_dir = arguments.get('--max-files-per-dir')
|
||||
|
||||
dir_path = Path(__file__).parent.absolute()
|
||||
|
||||
# If no train/valid/test data arguments are passed, default to the files checked into the repo.
|
||||
if not arguments['TRAIN_DATA_PATH']:
|
||||
arguments['TRAIN_DATA_PATH'] = str(dir_path/'data_dirs_train.txt')
|
||||
arguments['VALID_DATA_PATH'] = str(dir_path/'data_dirs_valid.txt')
|
||||
arguments['TEST_DATA_PATH'] = str(dir_path/'data_dirs_test.txt')
|
||||
|
||||
train_data_dirs = test.expand_data_path(arguments['TRAIN_DATA_PATH'], azure_info_path)
|
||||
valid_data_dirs = test.expand_data_path(arguments['VALID_DATA_PATH'], azure_info_path)
|
||||
test_data_dirs = test.expand_data_path(arguments['TEST_DATA_PATH'], azure_info_path)
|
||||
|
||||
# default model save location
|
||||
if not arguments['SAVE_FOLDER']:
|
||||
arguments['SAVE_FOLDER'] = str(dir_path.parent/'resources/saved_models/')
|
||||
|
||||
save_folder = arguments['SAVE_FOLDER']
|
||||
|
||||
model_class = model_restore_helper.get_model_class_from_name(arguments['--model'])
|
||||
|
||||
hyperparameters = model_class.get_default_hyperparameters()
|
||||
run_name = make_run_id(arguments)
|
||||
|
||||
# Make the wandb run name equal to the run ID (does not populate yet).
|
||||
hyperparameters['max_epochs'] = int(arguments.get('--max-num-epochs'))
|
||||
|
||||
if testrun:
|
||||
hyperparameters['max_epochs'] = 2
|
||||
if not max_files_per_dir:
|
||||
max_files_per_dir = 1
|
||||
|
||||
# override hyperparams if flag is passed
|
||||
hypers_override = arguments.get('--hypers-override')
|
||||
if hypers_override is not None:
|
||||
hyperparameters.update(json.loads(hypers_override))
|
||||
elif arguments.get('--hypers-override-file') is not None:
|
||||
with open(arguments.get('--hypers-override-file')) as f:
|
||||
hyperparameters.update(json.load(f))
|
||||
|
||||
os.makedirs(save_folder, exist_ok=True)
|
||||
|
||||
if tag_in_vcs:
|
||||
hyperparameters['git_commit'] = git_tag_run(run_name)
|
||||
|
||||
# Turn off wandb logging when --dryrun is passed.
|
||||
if arguments.get('--dryrun'):
|
||||
os.environ["WANDB_MODE"] = 'dryrun'
|
||||
# Save hyperparameters to the logging backend.
|
||||
# Values of type set must be filtered out, as they are not JSON serializable.
|
||||
wandb.init(name=run_name, config={k: v for k, v in hyperparameters.items() if not isinstance(v, set)})
|
||||
wandb.config.update({'model-class': arguments['--model'],
|
||||
'train_folder': str(train_data_dirs),
|
||||
'valid_folder': str(valid_data_dirs),
|
||||
'save_folder': str(save_folder),
|
||||
'test_folder': str(test_data_dirs),
|
||||
'CUDA_VISIBLE_DEVICES': os.environ.get("CUDA_VISIBLE_DEVICES", 'Not Set'),
|
||||
'run-name': arguments.get('--run-name'),
|
||||
'CLI-command': ' '.join(sys.argv)})
|
||||
|
||||
|
||||
if arguments.get('--evaluate-model'):
|
||||
model_path = RichPath.create(arguments['--evaluate-model'])
|
||||
else:
|
||||
model_path = run_train(model_class, train_data_dirs, valid_data_dirs, save_folder, hyperparameters,
|
||||
azure_info_path, run_name, arguments['--quiet'],
|
||||
max_files_per_dir=max_files_per_dir,
|
||||
parallelize=not(arguments['--sequential']))
|
||||
|
||||
wandb.config['best_model_path'] = str(model_path)
|
||||
wandb.save(str(model_path.to_local_path()))
|
||||
|
||||
# only limit files in test run if `--testrun` flag is passed by user.
|
||||
if testrun:
|
||||
compute_evaluation_metrics(model_path, arguments, azure_info_path, valid_data_dirs, test_data_dirs, max_files_per_dir)
|
||||
else:
|
||||
compute_evaluation_metrics(model_path, arguments, azure_info_path, valid_data_dirs, test_data_dirs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
run_and_debug(lambda: run(args), args['--debug'])
|
|
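# A minimal sketch of how a --hypers-override value is merged into the model's default
# hyperparameters, shown in isolation. Except for max_epochs, the hyperparameter names below
# are made up; the real names depend on the model class returned by model_restore_helper.
import json

defaults = {'max_epochs': 300, 'batch_size': 200, 'learning_rate': 0.01}
cli_override = '{"batch_size": 64, "learning_rate": 0.001}'
defaults.update(json.loads(cli_override))
print(defaults)  # {'max_epochs': 300, 'batch_size': 64, 'learning_rate': 0.001}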
@ -0,0 +1,215 @@
|
|||
## Code adapted from https://github.com/soaxelbrooke/python-bpe/blob/master/bpe/encoder.py
|
||||
## MIT License (see repository)
|
||||
|
||||
|
||||
""" An encoder which learns byte pair encodings for white-space separated text. Can tokenize, encode, and decode. """
|
||||
import typing
|
||||
from typing import Optional
|
||||
from collections import Counter
|
||||
|
||||
import toolz
|
||||
|
||||
try:
|
||||
from typing import Dict, Iterable, Callable, List, Any, Iterator
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
DEFAULT_EOW = '__eow'
|
||||
DEFAULT_SOW = '__sow'
|
||||
DEFAULT_UNK = '__unk'
|
||||
DEFAULT_PAD = '__pad'
|
||||
|
||||
|
||||
class BpeVocabulary(typing.Sized):
|
||||
"""
|
||||
Encodes white-space separated text using byte-pair encoding. See https://arxiv.org/abs/1508.07909 for details.
|
||||
"""
|
||||
|
||||
def __init__(self, vocab_size: int=8192, pct_bpe: float=0.2,
|
||||
ngram_min: int=2, ngram_max: int=8, required_tokens: Optional[Iterable[str]]=None, strict=True,
|
||||
EOW=DEFAULT_EOW, SOW=DEFAULT_SOW, UNK=DEFAULT_UNK, PAD=DEFAULT_PAD):
|
||||
if vocab_size < 1:
|
||||
raise ValueError('vocab size must be greater than 0.')
|
||||
|
||||
self.EOW = EOW
|
||||
self.SOW = SOW
|
||||
self.eow_len = len(EOW)
|
||||
self.sow_len = len(SOW)
|
||||
self.UNK = UNK
|
||||
self.PAD = PAD
|
||||
self.required_tokens = list(set(required_tokens or []).union({self.UNK, self.PAD}))
|
||||
self.vocab_size = vocab_size
|
||||
self.pct_bpe = pct_bpe
|
||||
self.word_vocab_size = max([int(vocab_size * (1 - pct_bpe)), len(self.required_tokens or [])])
|
||||
self.bpe_vocab_size = vocab_size - self.word_vocab_size
|
||||
self.word_vocab = {} # type: Dict[str, int]
|
||||
self.bpe_vocab = {} # type: Dict[str, int]
|
||||
self.inverse_word_vocab = {} # type: Dict[int, str]
|
||||
self.inverse_bpe_vocab = {} # type: Dict[int, str]
|
||||
self.ngram_min = ngram_min
|
||||
self.ngram_max = ngram_max
|
||||
self.strict = strict
|
||||
|
||||
def __len__(self):
|
||||
return self.vocab_size
|
||||
|
||||
def byte_pair_counts(self, words: Iterable[str]) -> Iterable[typing.Counter]:
|
||||
""" Counts space separated token character pairs:
|
||||
{'T h i s </w>': 4} -> {'Th': 4, 'hi': 4, 'is': 4}
|
||||
"""
|
||||
for token, count in self.count_tokens(words).items():
|
||||
bp_counts = Counter() # type: Counter
|
||||
for ngram in token.split(' '):
|
||||
bp_counts[ngram] += count
|
||||
for ngram_size in range(self.ngram_min, min([self.ngram_max, len(token)]) + 1):
|
||||
ngrams = [''.join(ngram) for ngram in toolz.sliding_window(ngram_size, token.split(' '))]
|
||||
|
||||
for ngram in ngrams:
|
||||
bp_counts[''.join(ngram)] += count
|
||||
|
||||
yield bp_counts
|
||||
|
||||
def count_tokens(self, words: Iterable[str]) -> Dict[str, int]:
|
||||
""" Count tokens into a BPE vocab """
|
||||
token_counts = Counter(words)
|
||||
return {' '.join(token): count for token, count in token_counts.items()}
|
||||
|
||||
def learn_word_vocab(self, word_counts: typing.Counter[str]) -> Dict[str, int]:
|
||||
""" Build vocab from self.word_vocab_size most common tokens in provided sentences """
|
||||
for token in set(self.required_tokens or []):
|
||||
word_counts[token] = int(2**31)
|
||||
word_counts[self.PAD] = int(2**32) # Make sure that PAD gets id=0
|
||||
sorted_word_counts = sorted(word_counts.items(), key=lambda p: -p[1])
|
||||
return {word: idx for idx, (word, count) in enumerate(sorted_word_counts[:self.word_vocab_size])}
|
||||
|
||||
def learn_bpe_vocab(self, words: Iterable[str]) -> Dict[str, int]:
|
||||
""" Learns a vocab of byte pair encodings """
|
||||
vocab = Counter() # type: typing.Counter
|
||||
for token in {self.SOW, self.EOW}:
|
||||
vocab[token] = int(2**63)
|
||||
for idx, byte_pair_count in enumerate(self.byte_pair_counts(words)):
|
||||
for byte_pair, count in byte_pair_count.items():
|
||||
vocab[byte_pair] += count
|
||||
|
||||
if (idx + 1) % 10000 == 0:
|
||||
self.trim_vocab(10 * self.bpe_vocab_size, vocab)
|
||||
|
||||
sorted_bpe_counts = sorted(vocab.items(), key=lambda p: -p[1])[:self.bpe_vocab_size]
|
||||
return {bp: idx + self.word_vocab_size for idx, (bp, count) in enumerate(sorted_bpe_counts)}
|
||||
|
||||
def fit(self, word_counts: typing.Counter[str]) -> None:
|
||||
""" Learn vocab from text. """
|
||||
|
||||
# First, learn word vocab
|
||||
self.word_vocab = self.learn_word_vocab(word_counts)
|
||||
|
||||
remaining_words = Counter({word: count for word, count in word_counts.items()
|
||||
if word not in self.word_vocab})
|
||||
self.bpe_vocab = self.learn_bpe_vocab(remaining_words.elements())
|
||||
|
||||
self.inverse_word_vocab = {idx: token for token, idx in self.word_vocab.items()}
|
||||
self.inverse_bpe_vocab = {idx: token for token, idx in self.bpe_vocab.items()}
|
||||
|
||||
@staticmethod
|
||||
def trim_vocab(n: int, vocab: Dict[str, int]) -> None:
|
||||
""" Deletes all pairs below 10 * vocab size to prevent memory problems """
|
||||
pair_counts = sorted(vocab.items(), key=lambda p: -p[1])
|
||||
pairs_to_trim = [pair for pair, count in pair_counts[n:]]
|
||||
for pair in pairs_to_trim:
|
||||
del vocab[pair]
|
||||
|
||||
def subword_tokenize(self, word: str) -> List[str]:
|
||||
""" Tokenizes inside an unknown token using BPE """
|
||||
end_idx = min([len(word), self.ngram_max])
|
||||
sw_tokens = [self.SOW]
|
||||
start_idx = 0
|
||||
|
||||
while start_idx < len(word):
|
||||
subword = word[start_idx:end_idx]
|
||||
if subword in self.bpe_vocab:
|
||||
sw_tokens.append(subword)
|
||||
start_idx = end_idx
|
||||
end_idx = min([len(word), start_idx + self.ngram_max])
|
||||
elif len(subword) == 1:
|
||||
sw_tokens.append(self.UNK)
|
||||
start_idx = end_idx
|
||||
end_idx = min([len(word), start_idx + self.ngram_max])
|
||||
else:
|
||||
end_idx -= 1
|
||||
|
||||
sw_tokens.append(self.EOW)
|
||||
return sw_tokens
|
||||
|
||||
def tokenize(self, word_tokens: List[str]) -> List[str]:
|
||||
""" Split a sentence into word and subword tokens """
|
||||
|
||||
tokens = []
|
||||
for word_token in word_tokens:
|
||||
if word_token in self.word_vocab:
|
||||
tokens.append(word_token)
|
||||
else:
|
||||
tokens.extend(self.subword_tokenize(word_token))
|
||||
|
||||
return tokens
|
||||
|
||||
def transform(self, sentences: Iterable[List[str]], reverse=False, fixed_length=None) -> Iterable[List[int]]:
|
||||
""" Turns tokens into vocab idxs """
|
||||
direction = -1 if reverse else 1
|
||||
for sentence in sentences:
|
||||
encoded = []
|
||||
tokens = list(self.tokenize(sentence))
|
||||
for token in tokens:
|
||||
if token in self.word_vocab:
|
||||
encoded.append(self.word_vocab[token])
|
||||
elif token in self.bpe_vocab:
|
||||
encoded.append(self.bpe_vocab[token])
|
||||
else:
|
||||
encoded.append(self.word_vocab[self.UNK])
|
||||
|
||||
if fixed_length is not None:
|
||||
encoded = encoded[:fixed_length]
|
||||
while len(encoded) < fixed_length:
|
||||
encoded.append(self.word_vocab[self.PAD])
|
||||
|
||||
yield encoded[::direction]
|
||||
|
||||
def inverse_transform(self, rows: Iterable[List[int]]) -> Iterator[str]:
|
||||
""" Turns token indexes back into space-joined text. """
|
||||
for row in rows:
|
||||
words = []
|
||||
|
||||
rebuilding_word = False
|
||||
current_word = ''
|
||||
for idx in row:
|
||||
if self.inverse_bpe_vocab.get(idx) == self.SOW:
|
||||
if rebuilding_word and self.strict:
|
||||
raise ValueError('Encountered second SOW token before EOW.')
|
||||
rebuilding_word = True
|
||||
|
||||
elif self.inverse_bpe_vocab.get(idx) == self.EOW:
|
||||
if not rebuilding_word and self.strict:
|
||||
raise ValueError('Encountered EOW without matching SOW.')
|
||||
rebuilding_word = False
|
||||
words.append(current_word)
|
||||
current_word = ''
|
||||
|
||||
elif rebuilding_word and (idx in self.inverse_bpe_vocab):
|
||||
current_word += self.inverse_bpe_vocab[idx]
|
||||
|
||||
elif rebuilding_word and (idx in self.inverse_word_vocab):
|
||||
current_word += self.inverse_word_vocab[idx]
|
||||
|
||||
elif idx in self.inverse_word_vocab:
|
||||
words.append(self.inverse_word_vocab[idx])
|
||||
|
||||
elif idx in self.inverse_bpe_vocab:
|
||||
if self.strict:
|
||||
raise ValueError("Found BPE index {} when not rebuilding word!".format(idx))
|
||||
else:
|
||||
words.append(self.inverse_bpe_vocab[idx])
|
||||
|
||||
else:
|
||||
raise ValueError("Got index {} that was not in word or BPE vocabs!".format(idx))
|
||||
|
||||
yield ' '.join(w for w in words if w != '')
|
|
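# A minimal usage sketch for BpeVocabulary on a toy corpus. With pct_bpe=0.8 most of the
# 32-entry budget is reserved for BPE merges rather than whole words. No exact tokenization is
# asserted, since the output depends on the learned merges.
if __name__ == '__main__':
    from collections import Counter

    corpus = ['low'] * 20 + ['lower', 'lowest', 'newer', 'new'] * 10
    vocab = BpeVocabulary(vocab_size=32, pct_bpe=0.8)
    vocab.fit(Counter(corpus))

    print(vocab.tokenize(['lowish']))                           # subword tokens for an unseen word, wrapped in __sow/__eow
    print(next(vocab.transform([['lowish']], fixed_length=8)))  # the same sentence as 8 padded vocab ids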
@ -0,0 +1,82 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Usage:
|
||||
embeddingvis.py [options] plot-tsne (--code | --query) MODEL_PATH
|
||||
embeddingvis.py [options] print-nns (--code | --query) MODEL_PATH DISTANCE_THRESHOLD
|
||||
|
||||
Options:
|
||||
--azure-info=<path> Azure authentication information file (JSON). Used to load data from Azure storage.
|
||||
--distance-metric METRIC The distance metric to use [default: cosine]
|
||||
--num-nns NUM The number of nearest neighbors to show when print-nns. [default: 2]
|
||||
--lim-items NUM Maximum number of items to use. Useful when memory is limited. [default: -1]
|
||||
-h --help Show this screen.
|
||||
--hypers-override HYPERS JSON dictionary overriding hyperparameter values.
|
||||
--language LANG The code language to use. Only when --code option is given. [default: python]
|
||||
--debug Enable debug routines. [default: False]
|
||||
"""
|
||||
from docopt import docopt
|
||||
from dpu_utils.utils import RichPath, run_and_debug
|
||||
from sklearn.manifold import TSNE
|
||||
import numpy as np
|
||||
from scipy.spatial.distance import pdist
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
import model_restore_helper
|
||||
from utils.visutils import square_to_condensed
|
||||
|
||||
|
||||
def run(arguments) -> None:
|
||||
azure_info_path = arguments.get('--azure-info', None)
|
||||
|
||||
model_path = RichPath.create(arguments['MODEL_PATH'], azure_info_path=azure_info_path)
|
||||
|
||||
model = model_restore_helper.restore(
|
||||
path=model_path,
|
||||
is_train=False)
|
||||
|
||||
if arguments['--query']:
|
||||
embeddings, elements = model.get_query_token_embeddings()
|
||||
else:
|
||||
embeddings, elements = model.get_code_token_embeddings(arguments['--language'])
|
||||
|
||||
max_num_elements = int(arguments['--lim-items'])
|
||||
if max_num_elements > 0:
|
||||
embeddings, elements = embeddings[:max_num_elements], elements[:max_num_elements]
|
||||
|
||||
print(f'Collected {len(elements)} elements to visualize.')
|
||||
|
||||
embeddings = model.sess.run(fetches=embeddings)
|
||||
|
||||
if arguments['plot-tsne']:
|
||||
emb_2d = TSNE(n_components=2, verbose=1, metric=arguments['--distance-metric']).fit_transform(embeddings)
|
||||
|
||||
plt.scatter(emb_2d[:, 0], emb_2d[:, 1])
|
||||
for i in range(len(elements)):
|
||||
plt.annotate(elements[i], xy=(emb_2d[i,0], emb_2d[i,1]))
|
||||
|
||||
plt.show()
|
||||
elif arguments['print-nns']:
|
||||
flat_distances = pdist(embeddings, arguments['--distance-metric'])
|
||||
num_nns = int(arguments['--num-nns'])
|
||||
|
||||
for i, element in enumerate(elements):
|
||||
distance_from_i = np.fromiter(
|
||||
(flat_distances[square_to_condensed(i, j, len(elements))] if i != j else float('inf') for j in
|
||||
range(len(elements))), dtype=float)
|
||||
|
||||
nns = [int(k) for k in np.argsort(distance_from_i)[:num_nns]]  # The num_nns nearest neighbours
|
||||
|
||||
if distance_from_i[nns[0]] > float(arguments['DISTANCE_THRESHOLD']):
|
||||
continue
|
||||
try:
|
||||
print(f'{element} --> ' + ', '.join(f'{elements[n]} ({distance_from_i[n]:.2f})' for n in nns))
|
||||
except Exception:
|
||||
print('Error printing token for nearest neighbors pair.')
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
run_and_debug(lambda: run(args), args.get('--debug', False))
|
|
@ -0,0 +1,19 @@
|
|||
from typing import List, Any
|
||||
import pickle
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def save_file_pickle(fname: str, obj: Any) -> None:
|
||||
with open(fname, 'wb') as f:
|
||||
pickle.dump(obj, f)
|
||||
|
||||
|
||||
def load_file_pickle(fname: str) -> None:
|
||||
with open(fname, 'rb') as f:
|
||||
obj = pickle.load(f)
|
||||
return obj
|
||||
|
||||
|
||||
def chunkify(df: pd.DataFrame, n: int) -> List[pd.DataFrame]:
|
||||
"turn pandas.dataframe into equal size n chunks."
|
||||
return [df[i::n] for i in range(n)]
|
|
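# A quick sketch of chunkify's behaviour: it strides over the DataFrame index, so the n pieces
# interleave rather than forming contiguous blocks, and sizes differ by at most one row.
if __name__ == '__main__':
    df = pd.DataFrame({'x': range(7)})
    chunks = chunkify(df, 3)
    print([len(c) for c in chunks])  # [3, 2, 2]
    print(list(chunks[0]['x']))      # [0, 3, 6]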
@ -0,0 +1,32 @@
|
|||
#!/usr/bin/env python
|
||||
"""A utility tool for extracting the identifier tokens from existing .jsonl.gz data. Primarily used for exporting
|
||||
data for MSR's tool for dataset deduplication at https://github.com/Microsoft/near-duplicate-code-detector.
|
||||
|
||||
Usage:
|
||||
jsonl2iddata.py [options] INPUT_PATH OUTPUT_PATH
|
||||
|
||||
Options:
|
||||
-h --help Show this screen.
|
||||
--azure-info=<path> Azure authentication information file (JSON). Used to load data from Azure storage.
|
||||
--debug Enable debug routines. [default: False]
|
||||
"""
|
||||
from docopt import docopt
|
||||
|
||||
from dpu_utils.utils import run_and_debug, RichPath, ChunkWriter
|
||||
|
||||
|
||||
def run(arguments):
|
||||
azure_info_path = arguments.get('--azure-info', None)
|
||||
input_folder = RichPath.create(arguments['INPUT_PATH'], azure_info_path)
|
||||
output_folder = RichPath.create(arguments['OUTPUT_PATH'], azure_info_path)
|
||||
|
||||
with ChunkWriter(output_folder, file_prefix='codedata', max_chunk_size=500, file_suffix='.jsonl.gz') as chunked_writer:
|
||||
for file in input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'):
|
||||
for line in file.read_by_file_suffix():
|
||||
tokens = line['code_tokens']
|
||||
chunked_writer.add(dict(filename='%s:%s:%s' % (line['repo'], line['path'], line['lineno']), tokens=tokens))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
run_and_debug(lambda: run(args), args.get('--debug', False))
|
|
@ -0,0 +1,113 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Usage:
|
||||
nearestneighbor.py [options] (--code | --query | --both) MODEL_PATH DATA_PATH
|
||||
|
||||
Options:
|
||||
--azure-info=<path> Azure authentication information file (JSON). Used to load data from Azure storage.
|
||||
--distance-metric METRIC The distance metric to use [default: cosine]
|
||||
-h --help Show this screen.
|
||||
--hypers-override HYPERS JSON dictionary overriding hyperparameter values.
|
||||
--debug Enable debug routines. [default: False]
|
||||
--num-nns NUM The number of NNs to visualize [default: 2]
|
||||
--distance-threshold TH The distance threshold above which to ignore [default: 0.2]
|
||||
--max-num-items LIMIT The maximum number of items to use. Use zero for all. [default: 0]
|
||||
"""
|
||||
import json
|
||||
from itertools import chain
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from docopt import docopt
|
||||
from dpu_utils.utils import RichPath, run_and_debug
|
||||
import numpy as np
|
||||
from more_itertools import take
|
||||
from scipy.spatial.distance import pdist
|
||||
|
||||
from pygments import highlight
|
||||
from pygments.lexers import get_lexer_by_name
|
||||
from pygments.formatters import TerminalFormatter
|
||||
|
||||
import model_restore_helper
|
||||
|
||||
# Condensed to square from
|
||||
# http://stackoverflow.com/questions/13079563/how-does-condensed-distance-matrix-work-pdist
|
||||
from utils.visutils import square_to_condensed
|
||||
|
||||
|
||||
def to_string(code: str, language: str) -> str:
|
||||
lexer = get_lexer_by_name(language, stripall=True)
|
||||
formatter = TerminalFormatter(linenos=True)
|
||||
return highlight(code, lexer, formatter)
|
||||
|
||||
def run(arguments) -> None:
|
||||
azure_info_path = arguments.get('--azure-info', None)
|
||||
data_path = RichPath.create(arguments['DATA_PATH'], azure_info_path)
|
||||
assert data_path.is_dir(), "%s is not a folder" % (data_path,)
|
||||
|
||||
hypers_override = arguments.get('--hypers-override')
|
||||
if hypers_override is not None:
|
||||
hypers_override = json.loads(hypers_override)
|
||||
else:
|
||||
hypers_override = {}
|
||||
|
||||
model_path = RichPath.create(arguments['MODEL_PATH'], azure_info_path=azure_info_path)
|
||||
|
||||
model = model_restore_helper.restore(
|
||||
path=model_path,
|
||||
is_train=False,
|
||||
hyper_overrides=hypers_override)
|
||||
|
||||
num_elements_to_take = int(arguments['--max-num-items'])
|
||||
data = chain(*chain(list(f.read_by_file_suffix()) for f in data_path.iterate_filtered_files_in_dir('*.jsonl.gz')))
|
||||
if num_elements_to_take == 0: # Take all
|
||||
data = list(data)
|
||||
else:
|
||||
assert num_elements_to_take > 0
|
||||
data = take(num_elements_to_take, data)
|
||||
|
||||
|
||||
num_nns = int(arguments['--num-nns'])
|
||||
|
||||
if arguments['--code']:
|
||||
representations = model.get_code_representations(data)
|
||||
elif arguments['--query']:
|
||||
representations = model.get_query_representations(data)
|
||||
else:
|
||||
code_representations = model.get_code_representations(data)
|
||||
query_representations = model.get_query_representations(data)
|
||||
representations = np.concatenate([code_representations, query_representations], axis=-1)
|
||||
|
||||
filtered_representations = []
|
||||
filtered_data = [] # type: List[Dict[str, Any]]
|
||||
for i, representation in enumerate(representations):
|
||||
if representation is None:
|
||||
continue
|
||||
filtered_representations.append(representation)
|
||||
filtered_data.append(data[i])
|
||||
|
||||
filtered_representations = np.stack(filtered_representations, axis=0)
|
||||
flat_distances = pdist(filtered_representations, arguments['--distance-metric'])
|
||||
|
||||
for i, data in enumerate(filtered_data):
|
||||
distance_from_i = np.fromiter(
|
||||
(flat_distances[square_to_condensed(i, j, len(filtered_data))] if i != j else float('inf') for j in
|
||||
range(len(filtered_data))), dtype=float)
|
||||
|
||||
nns = [int(k) for k in np.argsort(distance_from_i)[:num_nns]]  # The num_nns nearest neighbours
|
||||
|
||||
if distance_from_i[nns[0]] > float(arguments['--distance-threshold']):
|
||||
continue
|
||||
|
||||
print('===============================================================')
|
||||
print(f"{data['repo']}:{data['path']}:{data['lineno']}")
|
||||
print(to_string(data['original_string'], language=data['language']))
|
||||
|
||||
for j in range(num_nns):
|
||||
print()
|
||||
print(f'Nearest Neighbour {j+1}: {filtered_data[nns[j]]["repo"]}:{filtered_data[nns[j]]["path"]}:{filtered_data[nns[j]]["lineno"]} (distance {distance_from_i[nns[j]]})')
|
||||
print(to_string(filtered_data[nns[j]]['original_string'], language=filtered_data[nns[j]]['language']))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = docopt(__doc__)
|
||||
run_and_debug(lambda: run(args), args.get('--debug', False))
|
|
@ -0,0 +1,32 @@
|
|||
import pandas as pd
|
||||
from .general_utils import chunkify
|
||||
from dpu_utils.utils import RichPath
|
||||
from multiprocessing import Pool, cpu_count
|
||||
|
||||
|
||||
def df_to_jsonl(df: pd.DataFrame, RichPath_obj: RichPath, i: int, basefilename='codedata') -> str:
|
||||
dest_filename = f'{basefilename}_{str(i).zfill(5)}.jsonl.gz'
|
||||
RichPath_obj.join(dest_filename).save_as_compressed_file(df.to_dict(orient='records'))
|
||||
return str(RichPath_obj.join(dest_filename))
|
||||
|
||||
|
||||
def chunked_save_df_to_jsonl(df: pd.DataFrame,
|
||||
output_folder: RichPath,
|
||||
num_chunks: int=None,
|
||||
parallel: bool=True) -> None:
|
||||
"Chunk DataFrame (n chunks = num cores) and save as jsonl files."
|
||||
|
||||
df.reset_index(drop=True, inplace=True)
|
||||
# parallel saving to jsonl files on azure
|
||||
n = cpu_count() if num_chunks is None else num_chunks
|
||||
dfs = chunkify(df, n)
|
||||
args = zip(dfs, [output_folder]*len(dfs), range(len(dfs)))
|
||||
|
||||
if not parallel:
|
||||
for arg in args:
|
||||
dest_filename = df_to_jsonl(*arg)
|
||||
print(f'Wrote chunk to {dest_filename}')
|
||||
else:
|
||||
with Pool(cpu_count()) as pool:
|
||||
pool.starmap(df_to_jsonl, args)
|
||||
|
|
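# A minimal sketch of saving a toy DataFrame as chunked .jsonl.gz files into a local folder.
# The output path is made up, and parallel=False keeps the sketch single-process; RichPath also
# accepts Azure paths elsewhere in the pipeline.
if __name__ == '__main__':
    import os

    os.makedirs('/tmp/toy_jsonl_output', exist_ok=True)
    toy_df = pd.DataFrame({'code': ['def f(): pass', 'def g(): pass'],
                           'language': ['python', 'python']})
    chunked_save_df_to_jsonl(toy_df, RichPath.create('/tmp/toy_jsonl_output'),
                             num_chunks=2, parallel=False)
    # -> writes codedata_00000.jsonl.gz and codedata_00001.jsonl.gz under /tmp/toy_jsonl_output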
@ -0,0 +1,67 @@
|
|||
import multiprocessing
|
||||
from typing import List, Iterable, Callable, TypeVar
|
||||
|
||||
|
||||
JobType = TypeVar("JobType")
|
||||
ResultType = TypeVar("ResultType")
|
||||
|
||||
|
||||
def __parallel_queue_worker(worker_id: int,
|
||||
job_queue: multiprocessing.Queue,
|
||||
result_queue: multiprocessing.Queue,
|
||||
worker_fn: Callable[[int, JobType], Iterable[ResultType]]):
|
||||
while True:
|
||||
job = job_queue.get()
|
||||
|
||||
# "None" is the signal for last job, put that back in for other workers and stop:
|
||||
if job is None:
|
||||
job_queue.put(job)
|
||||
break
|
||||
|
||||
for result in worker_fn(worker_id, job):
|
||||
result_queue.put(result)
|
||||
result_queue.put(None)
|
||||
|
||||
|
||||
def run_jobs_in_parallel(all_jobs: List[JobType],
|
||||
worker_fn: Callable[[int, JobType], Iterable[ResultType]],
|
||||
received_result_callback: Callable[[ResultType], None],
|
||||
finished_callback: Callable[[], None],
|
||||
result_queue_size: int=100) -> None:
|
||||
"""
|
||||
Runs jobs in parallel and uses callbacks to collect results.
|
||||
:param all_jobs: Job descriptions; one at a time will be passed to worker_fn.
|
||||
:param worker_fn: Worker function receiving a job; many copies may run in parallel.
|
||||
Can yield results, which will be processed (one at a time) by received_result_callback.
|
||||
:param received_result_callback: Called when a result was produced by any worker. Only one will run at a time.
|
||||
:param finished_callback: Called when all jobs have been processed.
|
||||
"""
|
||||
job_queue = multiprocessing.Queue(len(all_jobs) + 1)
|
||||
for job in all_jobs:
|
||||
job_queue.put(job)
|
||||
job_queue.put(None) # Marker that we are done
|
||||
|
||||
# This will hold the actual results:
|
||||
result_queue = multiprocessing.Queue(result_queue_size)
|
||||
|
||||
# Create workers:
|
||||
num_workers = multiprocessing.cpu_count() - 1
|
||||
workers = [multiprocessing.Process(target=__parallel_queue_worker,
|
||||
args=(worker_id, job_queue, result_queue, worker_fn))
|
||||
for worker_id in range(num_workers)]
|
||||
for worker in workers:
|
||||
worker.start()
|
||||
|
||||
num_workers_finished = 0
|
||||
while True:
|
||||
result = result_queue.get()
|
||||
if result is None:
|
||||
num_workers_finished += 1
|
||||
if num_workers_finished == len(workers):
|
||||
finished_callback()
|
||||
break
|
||||
else:
|
||||
received_result_callback(result)
|
||||
|
||||
for worker in workers:
|
||||
worker.join()
|
|
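# A toy sketch of run_jobs_in_parallel: worker processes square a few numbers and the results
# are collected through the callbacks. The worker is defined at module level so it can be
# pickled when processes are spawned.
def _square_worker(worker_id: int, job: int):
    # A worker may yield any number of results per job; here it is exactly one.
    yield job * job


if __name__ == '__main__':
    results = []
    run_jobs_in_parallel(all_jobs=[1, 2, 3, 4],
                         worker_fn=_square_worker,
                         received_result_callback=results.append,
                         finished_callback=lambda: print('all jobs finished'))
    print(sorted(results))  # [1, 4, 9, 16]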
@ -0,0 +1,55 @@
|
|||
from subprocess import check_call
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from json import loads
|
||||
from requests import get
|
||||
|
||||
class Repo():
|
||||
"""
|
||||
Helper for cloning GitHub repos locally, because we don't have a good way to query the content of live repos directly.
|
||||
|
||||
Example usage:
|
||||
|
||||
# Instantiate object and retrieve code from repo: tensorflow/tensorflow
|
||||
> rc = Repo(org='tensorflow', repo_name='tensorflow', dest_path='/some/existing/folder')
|
||||
> rc.clone() # clones the repo if it does not exist locally, otherwise optionally pulls the latest code.
|
||||
|
||||
# returns list of Path objects in repo that end in '.py'
|
||||
> rc.get_filenames_with_extension('.py')
|
||||
"""
|
||||
|
||||
def __init__(self, org: str, repo_name: str, dest_path: str):
|
||||
self.metadata = self.__get_metadata(org, repo_name)
|
||||
assert Path(dest_path).is_dir(), f'Argument dest_path should be an existing directory: {dest_path}'
|
||||
self.dest_path = Path(dest_path)
|
||||
self.repo_save_folder = self.dest_path/self.metadata['full_name']
|
||||
|
||||
def __get_metadata(self, org: str, repo_name: str) -> dict:
|
||||
"Validates github org and repo_name, and returns metadata about the repo."
|
||||
|
||||
resp = get(f'https://api.github.com/repos/{org}/{repo_name}')
|
||||
resp.raise_for_status()
|
||||
info = loads(resp.text or resp.content)
|
||||
if 'clone_url' not in info:
|
||||
raise Exception(f'Cannot find repository {org}/{repo_name}')
|
||||
return info
|
||||
|
||||
def clone(self, refresh: bool=True) -> Path:
|
||||
"Will clone a repo (default branch only) into the desired path, or if already exists will optionally pull latest code."
|
||||
default_branch = self.metadata['default_branch']
|
||||
clone_url = self.metadata['clone_url']
|
||||
|
||||
if not self.repo_save_folder.exists():
|
||||
cmd = f'git clone --depth 1 -b {default_branch} --single-branch {clone_url} {str(self.repo_save_folder.absolute())}'
|
||||
print(f'Cloning repo:\n {cmd}')
|
||||
check_call(cmd, shell=True)
|
||||
|
||||
elif refresh:
|
||||
cmd = f'git -C {str(self.repo_save_folder.absolute())} pull'
|
||||
print(f'Pulling latest code from repo:\n {cmd}')
|
||||
check_call(cmd, shell=True)
|
||||
|
||||
def get_filenames_with_extension(self, extension: str='.py') -> List[Path]:
|
||||
"Return a list of filenames in the repo that end in the supplied extension."
|
||||
files = self.repo_save_folder.glob('**/*')
|
||||
return [f for f in files if f.is_file() and f.name.endswith(extension)]
|
|
@ -0,0 +1,149 @@
|
|||
from typing import List, Tuple, Dict, Any, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.ops.init_ops import Initializer
|
||||
|
||||
from dpu_utils.mlutils import Vocabulary
|
||||
|
||||
from utils.bpevocabulary import BpeVocabulary
|
||||
|
||||
BIG_NUMBER = 1e7
|
||||
|
||||
|
||||
def convert_and_pad_token_sequence(token_vocab: Union[Vocabulary, BpeVocabulary],
|
||||
token_sequence: List[str],
|
||||
output_tensor_size: int,
|
||||
pad_from_left: bool = False) \
|
||||
-> Tuple[np.ndarray, np.ndarray]:
|
||||
"""
|
||||
Tensorise token sequence with padding; returning a mask for used elements as well.
|
||||
|
||||
Args:
|
||||
token_vocab: Vocabulary or BPE encoder to use. We assume that token_vocab[0] is the padding symbol.
|
||||
token_sequence: List of tokens in string form
|
||||
output_tensor_size: Size of the resulting tensor (i.e., the length up to which we pad / down to which we truncate).
|
||||
pad_from_left: Indicates whether we pad/truncate on the left side of the sequence. [Default: False]
|
||||
|
||||
Returns:
|
||||
Pair of numpy arrays. First is the actual tensorised token sequence, the second is a masking tensor
|
||||
that is 1.0 for those token indices that are actually used.
|
||||
"""
|
||||
if isinstance(token_vocab, BpeVocabulary):
|
||||
token_ids = np.array(list(token_vocab.transform([token_sequence], fixed_length=output_tensor_size))[0])
|
||||
token_mask = np.array([1 if token_ids[i] > 0 else 0 for i in range(len(token_ids))])
|
||||
return token_ids, token_mask
|
||||
|
||||
if pad_from_left:
|
||||
token_sequence = token_sequence[-output_tensor_size:]
|
||||
else:
|
||||
token_sequence = token_sequence[:output_tensor_size]
|
||||
|
||||
sequence_length = len(token_sequence)
|
||||
if pad_from_left:
|
||||
start_idx = output_tensor_size - sequence_length
|
||||
else:
|
||||
start_idx = 0
|
||||
|
||||
token_ids = np.zeros(output_tensor_size, dtype=np.int32)
|
||||
token_mask = np.zeros(output_tensor_size, dtype=np.float32)
|
||||
for i, token in enumerate(token_sequence, start=start_idx):
|
||||
token_ids[i] = token_vocab.get_id_or_unk(token)
|
||||
token_mask[i] = True
|
||||
|
||||
return token_ids, token_mask
|
||||
|
||||
|
||||
def write_to_feed_dict(feed_dict: Dict[tf.Tensor, Any], placeholder, val) -> None:
|
||||
if len(val) == 0:
|
||||
ph_shape = [dim if dim is not None else 0 for dim in placeholder.shape.as_list()]
|
||||
feed_dict[placeholder] = np.empty(ph_shape)
|
||||
else:
|
||||
feed_dict[placeholder] = val
|
||||
|
||||
|
||||
class NoisyIdentityInitializer(Initializer):
|
||||
def __init__(self, noise: float=1e-1):
|
||||
self.__noise = noise
|
||||
self.__identity_initializer = tf.initializers.identity()
|
||||
self.__noise_initializer = tf.initializers.random_uniform(minval=-self.__noise, maxval=self.__noise)
|
||||
|
||||
def set_config(self):
|
||||
return {
|
||||
"noise": self.__noise,
|
||||
}
|
||||
|
||||
def __call__(self, shape, dtype=None, partition_info=None):
|
||||
identity = self.__identity_initializer(shape=shape, dtype=dtype, partition_info=partition_info)
|
||||
noise = self.__noise_initializer(shape=shape, dtype=dtype, partition_info=partition_info)
|
||||
return identity + noise
|
||||
|
||||
|
||||
def get_activation(activation_fun: Optional[str]):
|
||||
if activation_fun is None:
|
||||
return None
|
||||
activation_fun = activation_fun.lower()
|
||||
if activation_fun == 'linear':
|
||||
return None
|
||||
if activation_fun == 'tanh':
|
||||
return tf.tanh
|
||||
if activation_fun == 'relu':
|
||||
return tf.nn.relu
|
||||
if activation_fun == 'leaky_relu':
|
||||
return tf.nn.leaky_relu
|
||||
if activation_fun == 'elu':
|
||||
return tf.nn.elu
|
||||
if activation_fun == 'selu':
|
||||
return tf.nn.selu
|
||||
if activation_fun == 'gelu':
|
||||
def gelu(input_tensor):
|
||||
cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
|
||||
return input_tensor * cdf
|
||||
return gelu
|
||||
else:
|
||||
raise ValueError("Unknown activation function '%s'!" % activation_fun)
|
||||
|
||||
|
||||
def pool_sequence_embedding(pool_mode: str,
|
||||
sequence_token_embeddings: tf.Tensor,
|
||||
sequence_lengths: tf.Tensor,
|
||||
sequence_token_masks: tf.Tensor) -> tf.Tensor:
|
||||
"""
|
||||
Takes a batch of sequences of token embeddings and applies a pooling function,
|
||||
returning one representation for each sequence.
|
||||
|
||||
Args:
|
||||
pool_mode: The pooling mode, one of "mean", "max", "weighted_mean". For
|
||||
the latter, a weight network is introduced that computes a score (from [0,1])
|
||||
for each token, and embeddings are weighted by that score when computing
|
||||
the mean.
|
||||
sequence_token_embeddings: A float32 tensor of shape [B, T, D], where B is the
|
||||
batch dimension, T is the maximal number of tokens per sequence, and D is
|
||||
the embedding size.
|
||||
sequence_lengths: An int32 tensor of shape [B].
|
||||
sequence_token_masks: A float32 tensor of shape [B, T] with 0/1 values used
|
||||
for masking out unused entries in sequence_embeddings.
|
||||
Returns:
|
||||
A tensor of shape [B, D], containing the pooled representation for each
|
||||
sequence.
|
||||
"""
|
||||
if pool_mode == 'mean':
|
||||
seq_token_embeddings_masked = \
|
||||
sequence_token_embeddings * tf.expand_dims(sequence_token_masks, axis=-1) # B x T x D
|
||||
seq_token_embeddings_sum = tf.reduce_sum(seq_token_embeddings_masked, axis=1) # B x D
|
||||
sequence_lengths = tf.expand_dims(tf.cast(sequence_lengths, dtype=tf.float32), axis=-1) # B x 1
|
||||
return seq_token_embeddings_sum / sequence_lengths
|
||||
elif pool_mode == 'max':
|
||||
sequence_token_masks = -BIG_NUMBER * (1 - sequence_token_masks) # B x T
|
||||
sequence_token_masks = tf.expand_dims(sequence_token_masks, axis=-1) # B x T x 1
|
||||
return tf.reduce_max(sequence_token_embeddings + sequence_token_masks, axis=1)
|
||||
elif pool_mode == 'weighted_mean':
|
||||
token_weights = tf.layers.dense(sequence_token_embeddings,
|
||||
units=1,
|
||||
activation=tf.sigmoid,
|
||||
use_bias=False) # B x T x 1
|
||||
token_weights *= tf.expand_dims(sequence_token_masks, axis=-1) # B x T x 1
|
||||
seq_embedding_weighted_sum = tf.reduce_sum(sequence_token_embeddings * token_weights, axis=1) # B x D
|
||||
return seq_embedding_weighted_sum / (tf.reduce_sum(token_weights, axis=1) + 1e-8) # B x D
|
||||
else:
|
||||
raise ValueError("Unknown sequence pool mode '%s'!" % pool_mode)
|
|
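# A minimal sketch of padding and masking a short token sequence with
# convert_and_pad_token_sequence, using the BPE path so that everything needed is defined in
# this repository. The toy corpus is small enough for every word to land in the word vocabulary.
if __name__ == '__main__':
    from collections import Counter

    bpe_vocab = BpeVocabulary(vocab_size=64, pct_bpe=0.5)
    bpe_vocab.fit(Counter(['return', 'list', 'sort', 'sorted', 'reverse'] * 5))

    token_ids, token_mask = convert_and_pad_token_sequence(bpe_vocab, ['sort', 'list'],
                                                           output_tensor_size=8)
    print(token_ids.shape)  # (8,)
    print(token_mask)       # 0/1 mask: ones for the two real tokens, zeros for the padding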
@ -0,0 +1,5 @@
|
|||
def square_to_condensed(i, j, n):
|
||||
assert i != j, "no diagonal elements in condensed matrix"
|
||||
if i < j:
|
||||
i, j = j, i
|
||||
return int(n * j - j * (j + 1) / 2 + i - 1 - j)
|
|
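# A quick check that square_to_condensed agrees with scipy: the (i, j) entry of the square
# distance matrix equals the mapped position in the condensed pdist vector.
if __name__ == '__main__':
    import numpy as np
    from scipy.spatial.distance import pdist, squareform

    points = np.random.rand(5, 3)
    condensed = pdist(points)
    square = squareform(condensed)
    i, j = 3, 1
    assert np.isclose(condensed[square_to_condensed(i, j, len(points))], square[i, j])
    print('square_to_condensed matches scipy squareform indexing')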
@ -0,0 +1,3 @@
|
|||
[default]
|
||||
project = CodeSearchNet
|
||||
|
Binary file not shown.
|
@ -0,0 +1,2 @@
|
|||
/tests/data/python
|
||||
/tests/data/csharp
|
Binary file not shown.