Learning and deploying counting grids service code

This commit is contained in:
Spencer Buja 2019-06-18 17:54:38 -07:00
Parent 230c06ccee
Commit 99f4d5367a
28 changed files: 3409 additions and 0 deletions

5
Batch/.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,5 @@
{
"python.linting.pylintEnabled": true,
"python.linting.enabled": true,
"python.linting.lintOnSave": true,
}

292
Batch/Batch/.gitignore vendored Normal file
View File

@@ -0,0 +1,292 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
src/keys.json
src/metadata.json
src/metadata_other.json
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/packages/*
# except build/, which is used as an MSBuild target.
!**/packages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/packages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Typescript v1 declaration files
typings/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush
.cr/
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs

View File

@@ -0,0 +1,34 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from jobStatus import JobStatus
class BatchJob():
def __init__(self, id_in: str, jobStatus_in, progress_in: int):
self.id = id_in
self.jobStatus = jobStatus_in
self.progress = progress_in
def next(self):
if self.jobStatus.value in [JobStatus.NotStarted, JobStatus.PreProcessing, JobStatus.Training]:
self.jobStatus = JobStatus(self.jobStatus.value + 1)
else:
raise ValueError("Invalid job status value.")
def makeProgress(self, progress: int):
if progress < 0 or progress > 100:
raise ValueError("Invalid progress value.")
else:
self.progress = progress
if __name__ == "__main__":
b = BatchJob("", JobStatus.NotStarted, 0)
b.next()
assert(b.jobStatus == JobStatus.PreProcessing)
b.makeProgress(5)
assert(b.progress == 5)
import json
print(json.dumps(b.__dict__))

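The BatchJob class above, together with JobStatus defined further down in jobStatus.py, forms a small state machine (NotStarted, PreProcessing, Training, then Success or Failure) whose __dict__ is serialized as the heartbeat payload. A minimal usage sketch, not part of the commit, assuming the files are importable as the batchJob and jobStatus modules referenced elsewhere in this change:

import json

from batchJob import BatchJob
from jobStatus import JobStatus

job = BatchJob("job-123", JobStatus.NotStarted, 0)   # "job-123" is an illustrative id
for expected in (JobStatus.PreProcessing, JobStatus.Training):
    job.next()                        # NotStarted -> PreProcessing -> Training
    assert job.jobStatus == expected
job.makeProgress(40)                  # progress is validated to stay within [0, 100]
print(json.dumps(job.__dict__))       # IntEnum members serialize as plain integers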
View File

@@ -0,0 +1,272 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import datetime
import io
import os
import sys
import time
from datetime import timedelta
import uuid
import azure.storage.blob as azureblob
import azure.batch.batch_service_client as batch
import azure.batch.models as batchmodels
sys.path.append('.')
sys.path.append('..')
def query_yes_no(question, default="yes"):
"""
Prompts the user for yes/no input, displaying the specified question text.
:param str question: The text of the prompt for input.
:param str default: The default if the user hits <ENTER>. Acceptable values
are 'yes', 'no', and None.
:rtype: str
:return: 'yes' or 'no'
"""
valid = {'y': 'yes', 'n': 'no'}
if default is None:
prompt = ' [y/n] '
elif default == 'yes':
prompt = ' [Y/n] '
elif default == 'no':
prompt = ' [y/N] '
else:
raise ValueError("Invalid default answer: '{}'".format(default))
while True:
choice = input(question + prompt).lower()
if default and not choice:
return default
try:
return valid[choice[0]]
except (KeyError, IndexError):
print("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
def print_batch_exception(batch_exception):
"""
Prints the contents of the specified Batch exception.
:param batch_exception: The Batch exception (e.g. a BatchErrorException) whose details should be printed.
"""
print('-------------------------------------------')
print('Exception encountered:')
if batch_exception.error and \
batch_exception.error.message and \
batch_exception.error.message.value:
print(batch_exception.error.message.value)
if batch_exception.error.values:
print()
for mesg in batch_exception.error.values:
print('{}:\t{}'.format(mesg.key, mesg.value))
print('-------------------------------------------')
def download_file_from_container(block_blob_client, container_name, file_path, blob_name):
"""
Downloads the named blob from the given Azure Blob storage container to the local path `file_path`.
"""
print("\nDownloading blob to " + file_path)
block_blob_client.get_blob_to_path(container_name, blob_name, file_path)
def upload_file_to_container(block_blob_client, container_name, file_path):
"""
Uploads a local file to an Azure Blob storage container.
:param block_blob_client: A blob service client.
:type block_blob_client: `azure.storage.blob.BlockBlobService`
:param str container_name: The name of the Azure Blob storage container.
:param str file_path: The local path to the file.
:rtype: `azure.batch.models.ResourceFile`
:return: A ResourceFile initialized with a SAS URL appropriate for Batch
tasks.
"""
blob_name = os.path.basename(file_path)
print('Uploading file {} to container [{}]...'.format(file_path,
container_name))
block_blob_client.create_blob_from_path(container_name,
blob_name,
file_path)
# Obtain the SAS token for the container.
sas_token = get_container_sas_token(block_blob_client,
container_name, azureblob.BlobPermissions.READ)
sas_url = block_blob_client.make_blob_url(container_name,
blob_name,
sas_token=sas_token)
return batchmodels.ResourceFile(file_path=blob_name,
blob_source=sas_url)
def get_container_sas_token(block_blob_client,
container_name, blob_permissions):
"""
Obtains a shared access signature granting the specified permissions to the
container.
:param block_blob_client: A blob service client.
:type block_blob_client: `azure.storage.blob.BlockBlobService`
:param str container_name: The name of the Azure Blob storage container.
:param BlobPermissions blob_permissions:
:rtype: str
:return: A SAS token granting the specified permissions to the container.
"""
# Obtain the SAS token for the container, setting the expiry time and
# permissions. In this case, no start time is specified, so the shared
# access signature becomes valid immediately. Expiration is in 2 hours.
container_sas_token = \
block_blob_client.generate_container_shared_access_signature(
container_name,
permission=blob_permissions,
expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=2))
return container_sas_token
def get_container_sas_url(block_blob_client,
container_name, blob_permissions):
"""
Obtains a shared access signature URL that provides write access to the
output container to which the tasks will upload their output.
:param block_blob_client: A blob service client.
:type block_blob_client: `azure.storage.blob.BlockBlobService`
:param str container_name: The name of the Azure Blob storage container.
:param BlobPermissions blob_permissions:
:rtype: str
:return: A SAS URL granting the specified permissions to the container.
"""
# Obtain the SAS token for the container.
sas_token = get_container_sas_token(block_blob_client,
container_name, azureblob.BlobPermissions.WRITE)
# Construct the SAS URL for the container.
# Note: _STORAGE_ACCOUNT_NAME must be defined at module level before this is called.
container_sas_url = "https://{}.blob.core.windows.net/{}?{}".format(
_STORAGE_ACCOUNT_NAME, container_name, sas_token)
return container_sas_url
def create_pool(batch_service_client, pool_id, vm_size, imageName, versions, auto_scale_formula):
"""
Creates a pool of compute nodes with the specified OS settings.
:param batch_service_client: A Batch service client.
:type batch_service_client: `azure.batch.BatchServiceClient`
:param str pool_id: An ID for the new pool.
:param str vm_size: The size of the VMs in the pool.
:param str imageName: The name of the custom VM image used for the pool nodes.
:param list versions: The application package versions to deploy to the pool.
:param str auto_scale_formula: The auto-scale formula applied to the pool.
"""
print('Creating pool [{}]...'.format(pool_id))
new_pool = batch.models.PoolAddParameter(
id=pool_id,
virtual_machine_configuration=batchmodels.VirtualMachineConfiguration(
image_reference=batchmodels.ImageReference(
virtual_machine_image_id="/subscriptions/{}/resourceGroups/{}/providers/Microsoft.Compute/images/{}".format(
"ad49354a-6ce2-4dae-a51d-b6907372f608", "BrowseCloud", imageName)
),
node_agent_sku_id="batch.node.windows amd64"),
vm_size=vm_size,
start_task=None,
enable_auto_scale=True,
auto_scale_formula=auto_scale_formula,
application_package_references=[batchmodels.ApplicationPackageReference(
application_id="browsecloudtrainer", version=version) for version in versions],
auto_scale_evaluation_interval=timedelta(
minutes=5) # the smallest evaluation interval
)
batch_service_client.pool.add(new_pool)
def create_job(batch_service_client, job_id, pool_id):
"""
Creates a job with the specified ID, associated with the specified pool.
:param batch_service_client: A Batch service client.
:type batch_service_client: `azure.batch.BatchServiceClient`
:param str job_id: The ID for the job.
:param str pool_id: The ID for the pool.
"""
print('Creating job [{}]...'.format(job_id))
job = batch.models.JobAddParameter(
id=job_id,
pool_info=batch.models.PoolInformation(pool_id=pool_id))
batch_service_client.job.add(job)
def add_training_task(batch_service_client, job_id, command, displayName, containerName, blobName):
"""
Adds a task to the specified job.
:param batch_service_client: A Batch service client.
:type batch_service_client: `azure.batch.BatchServiceClient`
:param str job_id: The ID of the job to which to add the task.
:param str command: The command line the task should run.
:param str displayName: The display name of the task.
:param str containerName: The name of the container with the training data.
:param str blobName: The name of the training data blob.
"""
# sample Linux command "/bin/bash -c \"ffmpeg -i {} {} \"
# sample Windows command "cmd /c cd .. & cd C:\Users\chbuja"
GUID = str(uuid.uuid4())[:63]
autouser = batchmodels.AutoUserSpecification(
scope='task', elevation_level='admin')
userId = batchmodels.UserIdentity(auto_user=autouser)
tasks = list()
tasks.append(batch.models.TaskAddParameter(
id='{}'.format(GUID),
command_line=command,
display_name=displayName,
user_identity=userId
))
batch_service_client.task.add_collection(job_id, tasks)
def wait_for_tasks_to_complete(batch_service_client, job_id, timeout):
"""
Returns when all tasks in the specified job reach the Completed state.
:param batch_service_client: A Batch service client.
:type batch_service_client: `azure.batch.BatchServiceClient`
:param str job_id: The id of the job whose tasks should be monitored.
:param timedelta timeout: The duration to wait for task completion. If all
tasks in the specified job do not reach Completed state within this time
period, an exception will be raised.
"""
timeout_expiration = datetime.datetime.now() + timeout
print("Monitoring all tasks for 'Completed' state, timeout in {}..."
.format(timeout), end='')
while datetime.datetime.now() < timeout_expiration:
print('.', end='')
sys.stdout.flush()
tasks = batch_service_client.task.list(job_id)
incomplete_tasks = [task for task in tasks if
task.state != batchmodels.TaskState.completed]
if not incomplete_tasks:
print()
return True
else:
time.sleep(1)
print()
raise RuntimeError("ERROR: Tasks did not reach 'Completed' state within "
"timeout period of " + str(timeout))

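A hypothetical end-to-end sketch, not part of the commit, of how the helpers above compose when run in the same module: upload an input blob, create a job on an existing pool, queue a training task, and wait for completion. The account names, keys, pool and job ids, endpoint URL, and the shared-key auth choice are placeholders; the task command mirrors the example CLI documented in generateCountingGridsFromAzure.py later in this commit, and the real task command may differ.

import datetime

import azure.batch.batch_auth as batchauth
import azure.batch.batch_service_client as batch
import azure.storage.blob as azureblob

# Placeholder credentials; the deployment script in this commit uses
# ServicePrincipalCredentials instead of a shared key.
blob_client = azureblob.BlockBlobService(
    account_name="<storage-account>", account_key="<storage-key>")
batch_client = batch.BatchServiceClient(
    batchauth.SharedKeyCredentials("<batch-account>", "<batch-key>"),
    base_url="https://<batch-account>.<region>.batch.azure.com")

# Upload the training data; returns a ResourceFile with a read SAS URL.
resource_file = upload_file_to_container(
    blob_client, "trainingdata", "dictionaryVerySmallSample.txt")

# Queue one training task on an existing pool ("<pool-id>" and "<job-id>" are illustrative).
create_job(batch_client, "<job-id>", "<pool-id>")
add_training_task(
    batch_client, "<job-id>",
    command=("cmd /c python generateCountingGridsFromAzure.py trainingdata 24 5 "
             "numpyEngine simpleInput dictionaryVerySmallSample.txt bighash"),
    displayName="counting-grids-training",
    containerName="trainingdata",
    blobName="dictionaryVerySmallSample.txt")
wait_for_tasks_to_complete(batch_client, "<job-id>", datetime.timedelta(minutes=30))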
View File

@@ -0,0 +1,58 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import json
import adal
from msrestazure.azure_active_directory import AADTokenCredentials
import requests
class BrowseCloudServiceAuthorizer():
'''
Retrieves credentials for backend to send incremental results to the trainer.
Example:
url = 'https://contoso-browsecloud-service.azurewebsites.net/api/v1/documents'
bcsa = BrowseCloudServiceAuthorizer(resource_uri='https://contoso.oncontoso.com/BrowseCloud.Service')
headers = {'Content-Type': "application/json", 'Accept': "application/json", 'Authorization': bcsa.authOutput()}
res = requests.get(url, headers=headers)
print(res.text)
'''
def __init__(self, resource_uri: str):
self.token = ""
self.resource_uri = resource_uri
def retrieveToken(self):
"""
Authenticate using service principal w/ key.
"""
if self.resource_uri != "":
client_id = ""
client_secret = ""
authority_host_uri = ""
tenant = ""
# TODO: get this from key vault
with open("metadata.json", "r") as f:
data = json.load(f)
client_id = data['CLIENT_ID']
client_secret = data['CLIENT_SECRET']
authority_host_uri = data["AUTHORITY_HOST_URI"]
tenant = data["TENANT"]
authority_uri = authority_host_uri + '/' + tenant
resource_uri = self.resource_uri
context = adal.AuthenticationContext(
authority_uri, api_version=None)
self.token = context.acquire_token_with_client_credentials(
resource_uri, client_id, client_secret)
credentials = AADTokenCredentials(self.token, client_id)
# Returns:'Bearer <MY_TOKEN>'
def authOutput(self):
self.retrieveToken()
return 'Bearer ' + self.token['accessToken']

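retrieveToken() above reads its AAD client credentials from a local metadata.json, flagged in the code as a TODO to move into Key Vault. A hedged sketch, not part of the commit, that writes a placeholder file containing the four keys the class reads; every value is illustrative.

import json

# All values are placeholders; retrieveToken() reads exactly these keys.
placeholder_metadata = {
    "CLIENT_ID": "<aad-application-client-id>",
    "CLIENT_SECRET": "<aad-application-client-secret>",
    "AUTHORITY_HOST_URI": "https://login.microsoftonline.com",  # assumed typical ADAL authority host
    "TENANT": "<tenant-id-or-domain>",
}

with open("metadata.json", "w") as f:
    json.dump(placeholder_metadata, f, indent=2)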
View File

@@ -0,0 +1,54 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from jobStatus import JobStatus
import threading
import requests
class CountingGridsHeartBeater():
def __init__(self, url, batchJob, bcServiceAuthorizer):
self.url = url
self.batchJob = batchJob
self.bcServiceAuthorizer = bcServiceAuthorizer
self.RESET_PROGRESS_NUMBER = 0
self.FINISHED_PROGRESS_NUMBER = 100
def workerFactory(self, batchJob, bcServiceAuthorizer):
url = self.url
def worker():
data = batchJob.__dict__
if url != '':
headers = {'Content-Type': "application/json", 'Accept': "application/json",
'Authorization': self.bcServiceAuthorizer.authOutput()}
res = requests.put(self.url, json=data, headers=headers)
print(res)
print(res.text)
else:
print("Faking heartbeat with following batchJob:")
print(data)
return worker
def beat(self):
t = threading.Thread(target=self.workerFactory(
self.batchJob, self.bcServiceAuthorizer), args=())
t.daemon = False
t.start()
def next(self):
self.batchJob.next()
self.batchJob.makeProgress(self.RESET_PROGRESS_NUMBER)
self.beat()
def makeProgress(self, progress: int):
self.batchJob.makeProgress(progress)
self.beat()
def done(self, success: bool):
if success:
self.batchJob.jobStatus = JobStatus.Success
else:
self.batchJob.jobStatus = JobStatus.Failure
self.makeProgress(self.FINISHED_PROGRESS_NUMBER)

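A minimal sketch, not part of the commit, of the heartbeat lifecycle that the training scripts below drive; passing an empty url makes the worker print the serialized BatchJob instead of PUTting it to the service.

from batchJob import BatchJob
from browseCloudServiceAuthorizer import BrowseCloudServiceAuthorizer
from countingGridsHeartBeater import CountingGridsHeartBeater
from jobStatus import JobStatus

job = BatchJob("job-123", JobStatus.NotStarted, 0)            # illustrative id
beater = CountingGridsHeartBeater("", job, BrowseCloudServiceAuthorizer(""))
beater.next()              # NotStarted -> PreProcessing, progress reset to 0
beater.makeProgress(50)    # report partial progress
beater.next()              # PreProcessing -> Training
beater.done(success=True)  # jobStatus set to Success, progress set to 100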
View File

@@ -0,0 +1,127 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from __future__ import print_function
import datetime
import io
import os
import sys
import time
from datetime import timedelta
import json
from browseCloudAzureUtilities import *
import azure.storage.blob as azureblob
import azure.batch.batch_service_client as batch
import azure.batch.batch_auth as batchauth
import azure.batch.models as batchmodels
from azure.batch import BatchServiceClient
from azure.common.credentials import ServicePrincipalCredentials
sys.path.append('.')
sys.path.append('..')
# Update the Batch and Storage account credential strings below with the values
# unique to your accounts. These are used when constructing connection strings
# for the Batch and Storage client objects.
_BATCH_ACCOUNT_NAME = ""
_BATCH_ACCOUNT_KEY = ""
_BATCH_ACCOUNT_URL = ""
_SERVICE_PRINCIPAL_KEY = ""
with open("keys.json", "r") as f:
data = json.load(f)
_BATCH_ACCOUNT_KEY = data['_BATCH_ACCOUNT_KEY']
_SERVICE_PRINCIPAL_KEY = data['_SERVICE_PRINCIPAL_KEY_DEV']
if __name__ == '__main__':
start_time = datetime.datetime.now().replace(microsecond=0)
print('Sample start: {}'.format(start_time))
print()
# Create a Batch service client using Azure AD (service principal) credentials.
# The shared-key alternative below is kept commented out for reference:
'''
credentials = batchauth.SharedKeyCredentials(_BATCH_ACCOUNT_NAME,
_BATCH_ACCOUNT_KEY)
'''
# Have separate applications, deployment locations, scaling formulae, etc., for dev and prod
# This is read from a file
JOB_ID = ""
POOL_VM_SIZE = ""
POOL_ID = ""
IMAGE_NAME = ""
SCALING_FORMULA = ""
VERSIONS = []
TENANT_ID = ""
CLIENT_ID = ""
with open('metadata.json', 'r') as fMeta:
dataMeta = json.load(fMeta)
TENANT_ID = dataMeta["TENANT_ID_DEPLOYMENT_AND_TESTING"]
CLIENT_ID = dataMeta["CLIENT_ID_DEPLOYMENT_AND_TESTING"]
_BATCH_ACCOUNT_NAME = dataMeta["_BATCH_ACCOUNT_NAME"]
_BATCH_ACCOUNT_URL = dataMeta["_BATCH_ACCOUNT_URL"]
if dataMeta['ENV'] == 'DEV':
JOB_ID = dataMeta['JOB_ID_DEV']
POOL_VM_SIZE = dataMeta['POOL_VM_SIZE_DEV']
POOL_ID = dataMeta['POOL_ID_DEV']
IMAGE_NAME = dataMeta['IMAGE_NAME_DEV']
SCALING_FORMULA = dataMeta['SCALING_FORMULA_DEV']
VERSIONS = dataMeta['VERSIONS_DEV']
elif dataMeta['ENV'] == 'PROD':
JOB_ID = dataMeta['JOB_ID_PROD']
POOL_VM_SIZE = dataMeta['POOL_VM_SIZE_PROD']
POOL_ID = dataMeta['POOL_ID_PROD']
IMAGE_NAME = dataMeta['IMAGE_NAME_PROD']
SCALING_FORMULA = dataMeta['SCALING_FORMULA_PROD']
VERSIONS = dataMeta['VERSIONS_PROD']
else:
raise ValueError("Environment type in metadata.json is invalid.")
SECRET = _SERVICE_PRINCIPAL_KEY
credentials = ServicePrincipalCredentials(
client_id=CLIENT_ID,
secret=SECRET,
tenant=TENANT_ID,
resource="https://batch.core.windows.net/"
)
batch_client = batch.BatchServiceClient(
credentials,
base_url=_BATCH_ACCOUNT_URL)
try:
# Create the pool that will contain the compute nodes that will execute the
# tasks.
create_pool(batch_client, POOL_ID, POOL_VM_SIZE,
IMAGE_NAME, VERSIONS, SCALING_FORMULA)
# Create the job that will run the tasks.
create_job(batch_client, JOB_ID, POOL_ID)
print(" Success! All tasks reached the 'Completed' state within the "
"specified timeout period.")
except batchmodels.BatchErrorException as err:
print_batch_exception(err)
raise
# Print out some timing info
end_time = datetime.datetime.now().replace(microsecond=0)
print()
print('Sample end: {}'.format(end_time))
print('Elapsed time: {}'.format(end_time - start_time))
print()
print()
input('Press ENTER to exit...')

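The deployment script above expects two local JSON files that are not checked in: keys.json for secrets and metadata.json for environment settings. A hedged sketch, not part of the commit, that writes placeholder files with the keys the DEV code path reads; every value, including the VM size and auto-scale formula, is illustrative.

import json

# Secrets read near the top of the script; values are placeholders.
placeholder_keys = {
    "_BATCH_ACCOUNT_KEY": "<batch-account-key>",
    "_SERVICE_PRINCIPAL_KEY_DEV": "<service-principal-secret>",
}

# Environment settings read from metadata.json on the DEV branch; values are placeholders.
placeholder_metadata = {
    "ENV": "DEV",
    "TENANT_ID_DEPLOYMENT_AND_TESTING": "<tenant-id>",
    "CLIENT_ID_DEPLOYMENT_AND_TESTING": "<client-id>",
    "_BATCH_ACCOUNT_NAME": "<batch-account-name>",
    "_BATCH_ACCOUNT_URL": "https://<batch-account>.<region>.batch.azure.com",
    "JOB_ID_DEV": "<job-id>",
    "POOL_VM_SIZE_DEV": "STANDARD_D2_V2",           # assumed example VM size
    "POOL_ID_DEV": "<pool-id>",
    "IMAGE_NAME_DEV": "<custom-image-name>",
    "SCALING_FORMULA_DEV": "$TargetDedicatedNodes = 1;",  # assumed minimal auto-scale formula
    "VERSIONS_DEV": ["1.0"],
}

with open("keys.json", "w") as f:
    json.dump(placeholder_keys, f, indent=2)
with open("metadata.json", "w") as f:
    json.dump(placeholder_metadata, f, indent=2)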
View File

@@ -0,0 +1,229 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import hashlib
import json
import datetime
import pandas as pd
import os
import numpy as np
from CountingGridsPy.models import CountingGridModel
import traceback
from browseCloudServiceAuthorizer import BrowseCloudServiceAuthorizer
from countingGridsHeartBeater import CountingGridsHeartBeater
from jobStatus import JobStatus
from batchJob import BatchJob
import azure.storage.blob as azureblob
from browseCloudAzureUtilities import upload_file_to_container, download_file_from_container
import matplotlib.pyplot as plt
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, CGEngineWrapper, NLPCleaner, PipelineTimer
import sys
sys.path.append("../../..")
# CLI: python generateCountingGridsFromAzure.py <input_containername> <extent_size_of_grid_hyperparameter>
# <window_size_of_grid_hyperparameter> <engine_type> <inputfile_type> <inputfile_name> <output_containername>
#
# Example CLI: python generateCountingGridsFromAzure.py trainingdata 24 5 numpyEngine simpleInput dictionaryVerySmallSample.txt bighash
# Assumes: input_containername and output_containername must be <64 characters long.
if __name__ == "__main__":
HEART_BEATER = None
try:
# ---------------------------------------------------------------------------------------
# Input
# ---------------------------------------------------------------------------------------
errStr = "Please give valid command-line arguments."
if len(sys.argv) != 8:
raise ValueError(errStr)
containerNameIn = sys.argv[1]
EXTENT_SIZE = int(sys.argv[2])
WINDOW_SIZE = int(sys.argv[3])
engine_type = sys.argv[4]
inputfile_type = sys.argv[5]
blobName = sys.argv[6]
containerNameOut = sys.argv[7]
if engine_type != "numpyEngine" and engine_type != "matlabEngine":
raise ValueError(
"The {0} engine does not exist. Please use 'matlabEngine' or 'numpyEngine'.".format(engine_type))
engine_type = engine_type[:-6] # 6 characters in the word "Engine"
if inputfile_type != "metadataInput" and inputfile_type != "simpleInput":
raise ValueError(
"The {0} input type does not exist. Please use 'simpleInput' or 'metadataInput'.".format(inputfile_type))
inputfile_type = inputfile_type[:-5] # remove "Input"
# ---------------------------------------------------------------------------------------
# Authentication
# ---------------------------------------------------------------------------------------
AUTH_URL = ""
SERVICE_URL = ""
_STORAGE_ACCOUNT_NAME_IN = 'browsecloudtrainingdata'
_STORAGE_ACCOUNT_KEY_IN = ""
_STORAGE_ACCOUNT_NAME_OUT = 'browsecloudmodelfiles'
_STORAGE_ACCOUNT_KEY_OUT = ""
jobId = containerNameOut
docId = containerNameIn
with open('metadata.json', 'r') as fMeta:
dataMeta = json.load(fMeta)
AUTH_URL = dataMeta["AUTH_URL"]
_STORAGE_ACCOUNT_NAME_IN = dataMeta["_STORAGE_ACCOUNT_NAME_TRAININGDATA"]
_STORAGE_ACCOUNT_NAME_OUT = dataMeta["_STORAGE_ACCOUNT_NAME_MODELS"]
if dataMeta['ENV'] == 'DEV':
# TODO: Use Key Vault and a certificate to retrieve this information
# instead of a temporary keys file.
# Note that keys.json is not checked in.
with open("keys.json", "r") as f:
data = json.load(f)
_STORAGE_ACCOUNT_KEY_IN = data['_STORAGE_ACCOUNT_KEY_DEV']
_STORAGE_ACCOUNT_KEY_OUT = data['_STORAGE_ACCOUNT_KEY_OUT_DEV']
_STORAGE_ACCOUNT_NAME_OUT = data['_STORAGE_ACCOUNT_NAME_OUT_DEV'] if (
'_STORAGE_ACCOUNT_NAME_OUT_DEV' in data
) else _STORAGE_ACCOUNT_NAME_OUT
_STORAGE_ACCOUNT_NAME_IN = data['_STORAGE_ACCOUNT_NAME_IN_DEV'] if (
'_STORAGE_ACCOUNT_NAME_IN_DEV' in data
) else _STORAGE_ACCOUNT_NAME_IN
SERVICE_URL = dataMeta["SERVICE_URL_DEV"] + "/api/v1/jobs/" + \
jobId if "SERVICE_URL_DEV" in dataMeta else SERVICE_URL
elif dataMeta['ENV'] == 'PROD':
with open("keys.json", "r") as f:
data = json.load(f)
_STORAGE_ACCOUNT_KEY_IN = data['_STORAGE_ACCOUNT_KEY_PROD']
_STORAGE_ACCOUNT_KEY_OUT = data['_STORAGE_ACCOUNT_KEY_OUT_PROD']
_STORAGE_ACCOUNT_NAME_OUT = data['_STORAGE_ACCOUNT_NAME_OUT_PROD'] if (
'_STORAGE_ACCOUNT_NAME_OUT_PROD' in data
) else _STORAGE_ACCOUNT_NAME_OUT
_STORAGE_ACCOUNT_NAME_IN = data['_STORAGE_ACCOUNT_NAME_IN_PROD'] if (
'_STORAGE_ACCOUNT_NAME_IN_PROD' in data
) else _STORAGE_ACCOUNT_NAME_IN
SERVICE_URL = dataMeta["SERVICE_URL_PROD"] + "/api/v1/jobs/" + \
jobId if "SERVICE_URL_PROD" in dataMeta else SERVICE_URL
else:
raise ValueError(
"Environment type in metadata.json is invalid.")
BATCH_JOB = BatchJob(jobId, JobStatus.NotStarted, 0)
SERVICE_AUTHORIZER = BrowseCloudServiceAuthorizer(AUTH_URL)
HEART_BEATER = CountingGridsHeartBeater(
SERVICE_URL, BATCH_JOB, SERVICE_AUTHORIZER)
DIRECTORY_SUFFIX = hashlib.sha3_256(
(docId+blobName+str(datetime.datetime.now())).encode()).hexdigest()
DIRECTORY_DATA = blobName.split(".")[0] + "_" + DIRECTORY_SUFFIX
if not os.path.isdir(DIRECTORY_DATA):
os.mkdir(DIRECTORY_DATA)
'''
Algorithm:
1. Get training data from Azure.
2. Run learning code.
3. Write model files to Azure.
Changes between this and dumpCountingGrids.py:
1. DIRECTORY_DIR must be unique.
2. Fetching and writing to Azure. Idea is to fetch into directory and then write it to Azure
'''
blob_client = azureblob.BlockBlobService(
account_name=_STORAGE_ACCOUNT_NAME_IN,
account_key=_STORAGE_ACCOUNT_KEY_IN)
download_file_from_container(
blob_client, containerNameIn, DIRECTORY_DATA+"/"+blobName, blobName)
FILE_NAME = DIRECTORY_DATA + "/" + blobName
CLEAN_DATA_FILE_NAME, MIN_FREQUENCY, MIN_WORDS = [
"/cg-processed.tsv", 2, 5]
# ---------------------------------------------------------------------------------------
# Data Cleaning
# ---------------------------------------------------------------------------------------
cleaner = NLPCleaner()
HEART_BEATER.next()
correspondences = None
CACHED_CORRESPONDENCES_FILE_NAME = "/cached_correspondences.tsv"
pTimer = PipelineTimer()
pTimer("Reading data file.")
df, keep = cleaner.read(
FILE_NAME, inputfile_type, MIN_FREQUENCY, MIN_WORDS)
if not (os.path.exists(DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME) and os.path.exists(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME)):
pTimer("Starting data cleaning.")
cleaner.handle_negation_tokens()
cleaner.removePunctuation()
HEART_BEATER.makeProgress(50)
correspondences = cleaner.lemmatize()
cleaner.write_cached_correspondences(
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
cleaner.write(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME)
else:
pTimer("Skipping data cleaning.")
correspondences = cleaner.read_cached_correspondences(
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
pTimer("Learning counting grid.")
LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
# ---------------------------------------------------------------------------------------
# Learning
# ---------------------------------------------------------------------------------------
engine = CGEngineWrapper(
extent_size=EXTENT_SIZE, window_size=WINDOW_SIZE, heartBeaters=[HEART_BEATER])
HEART_BEATER.next()
vocabulary = None
if not os.path.exists(DIRECTORY_DATA + LEARNED_GRID_FILE_NAME):
vocabulary, keep = engine.fit(
DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS, MIN_FREQUENCY, keep, engine=engine_type)
else:
vocabulary, keep = engine.get_vocab(
DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS, MIN_FREQUENCY, keep)
# ---------------------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------------------
pTimer("Generating counting grid artifacts.")
LINK_FILE_NAME = ""
bcag = BrowseCloudArtifactGenerator(DIRECTORY_DATA)
bcag.read(LEARNED_GRID_FILE_NAME)
bcag.write_docmap(engine.wd_size, engine=engine_type)
bcag.write_counts()
bcag.write_vocabulary(vocabulary)
bcag.write_top_pi()
bcag.write_top_pi_layers()
bcag.write_colors() # write default blue colors
bcag.write_database(df, keep)
bcag.write_correspondences(correspondences, vocabulary)
bcag.write_keep(keep)
pTimer("Done.")
blob_client = azureblob.BlockBlobService(
account_name=_STORAGE_ACCOUNT_NAME_OUT,
account_key=_STORAGE_ACCOUNT_KEY_OUT)
# TODO: if the container name is not unique, fall back to creating additional containers.
blob_client.create_container(containerNameOut)
# Upload each generated file, except the input file, to the output container.
FILES = [f.path for f in os.scandir(DIRECTORY_DATA) if not f.is_dir()]
for modelfile in FILES:
if not modelfile.endswith(blobName):
upload_file_to_container(
blob_client, containerNameOut, modelfile)
except BaseException:
if HEART_BEATER is not None:
HEART_BEATER.done(success=False)
print("Script failed.")
print(traceback.format_exc())
else:
HEART_BEATER.done(success=True)
print("Script succeeded.")

View File

@@ -0,0 +1,201 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import hashlib
import json
import datetime
import pandas as pd
import os
import numpy as np
import requests
import re
import traceback
from itertools import islice
from browseCloudServiceAuthorizer import BrowseCloudServiceAuthorizer
from countingGridsHeartBeater import CountingGridsHeartBeater
from jobStatus import JobStatus
from batchJob import BatchJob
import azure.storage.blob as azureblob
from browseCloudAzureUtilities import upload_file_to_container, download_file_from_container
import matplotlib.pyplot as plt
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, PipelineTimer
import sys
sys.path.append("../../..")
if __name__ == "__main__":
HEART_BEATER = None
try:
errStr = "Please give valid command-line arguments."
if len(sys.argv) != 5:
raise ValueError(errStr)
containerNameIn = sys.argv[1] # targetId
metadataColumnName = sys.argv[2] # metadata column to color by
containerNameOut = sys.argv[3] # id of current job
windowSize = int(sys.argv[4]) # window size
AUTH_URL = ""
SERVICE_URL = ""
_STORAGE_ACCOUNT_NAME = 'browsecloudmodelfiles'
_STORAGE_ACCOUNT_KEY = ''
jobId = containerNameOut
targetId = containerNameIn
with open("keys.json", "r") as f:
data = json.load(f)
_SENTIMENT_ANALYSIS_KEY = data['_SENTIMENT_ANALYSIS_KEY']
with open('metadata.json', 'r') as fMeta:
dataMeta = json.load(fMeta)
AUTH_URL = dataMeta["AUTH_URL"]
if dataMeta['ENV'] == 'DEV':
# TODO: Use Key Vault and a certificate to retrieve this information
# instead of a temporary keys file.
# Note that keys.json is not checked in.
with open("keys.json", "r") as f:
data = json.load(f)
_STORAGE_ACCOUNT_KEY = data['_STORAGE_ACCOUNT_KEY_OUT_DEV']
_STORAGE_ACCOUNT_NAME = data['_STORAGE_ACCOUNT_NAME_OUT_DEV'] if (
'_STORAGE_ACCOUNT_NAME_OUT_DEV' in data
) else _STORAGE_ACCOUNT_NAME
SERVICE_URL = dataMeta["SERVICE_URL_DEV"] + "/api/v1/jobs/" + \
jobId if "SERVICE_URL_DEV" in dataMeta else SERVICE_URL
elif dataMeta['ENV'] == 'PROD':
with open("keys.json", "r") as f:
data = json.load(f)
_STORAGE_ACCOUNT_KEY = data['_STORAGE_ACCOUNT_KEY_OUT_PROD']
_STORAGE_ACCOUNT_NAME = data['_STORAGE_ACCOUNT_NAME_OUT_PROD'] if (
'_STORAGE_ACCOUNT_NAME_OUT_PROD' in data
) else _STORAGE_ACCOUNT_NAME
SERVICE_URL = dataMeta["SERVICE_URL_PROD"] + "/api/v1/jobs/" + \
jobId if "SERVICE_URL_PROD" in dataMeta else SERVICE_URL
else:
raise ValueError(
"Environment type in metadata.json is invalid.")
BATCH_JOB = BatchJob(jobId, JobStatus.NotStarted, 0)
SERVICE_AUTHORIZER = BrowseCloudServiceAuthorizer(AUTH_URL)
HEART_BEATER = CountingGridsHeartBeater(
SERVICE_URL, BATCH_JOB, SERVICE_AUTHORIZER)
# Setup Working Directory.
DIRECTORY_SUFFIX = hashlib.sha3_256(
(targetId+str(datetime.datetime.now())).encode()).hexdigest()
DIRECTORY_DATA = targetId + "_" + DIRECTORY_SUFFIX
if not os.path.isdir(DIRECTORY_DATA):
os.mkdir(DIRECTORY_DATA)
'''
Algorithm:
1. Copy everything over from target container to job container.
2. Separate and properly encode the feature column.
3. Weight documents on grid.
4. Output colors file.
5. Write legend file.
'''
pTimer = PipelineTimer()
pTimer("Downloading Target Job Files.")
# Advance the heart beater to the Training state.
HEART_BEATER.next()
HEART_BEATER.next()
blob_client = azureblob.BlockBlobService(
account_name=_STORAGE_ACCOUNT_NAME,
account_key=_STORAGE_ACCOUNT_KEY)
all_blobs = blob_client.list_blobs(containerNameIn)
for blob in all_blobs:
download_file_from_container(
blob_client, containerNameIn, DIRECTORY_DATA+"/"+blob.name, blob.name)
HEART_BEATER.makeProgress(20)
pTimer("Getting sentiment from Azure.")
# Feature Mapping data
raw_feature = []
def dictionary_from_line(row):
prop_dict = {}
row_components = row.split('\t')
for row_component in row_components:
row_key_value = row_component.split(':')
prop_dict[row_key_value[0]] = row_key_value[1]
return prop_dict
with open(DIRECTORY_DATA+"/database.txt", 'r') as f:
for row in f:
prop_dict = dictionary_from_line(row)
raw_feature.append(prop_dict[metadataColumnName])
# first, try to convert all values to numbers
feature_map = []
labelTuples = []
cm = None
try:
feature_map = np.array([float(x) for x in raw_feature])
labelTuples = [('{0:.3g}'.format(np.min(feature_map)) + " " + metadataColumnName,
'{0:.3g}'.format(np.max(feature_map)) + " " + metadataColumnName)]
feature_map = (feature_map - np.min(feature_map)) / \
(np.max(feature_map) - np.min(feature_map))
cm = plt.get_cmap('cool')
except:
# Not all numbers. Try to do a 2-categorical
set_of_values = set(raw_feature)
if len(set_of_values) != 2:
raise ValueError(
"Selected metadata row is neither numerical nor 2-categorical")
mask = [int(x == raw_feature[0]) for x in raw_feature]
feature_map = np.array(mask)
cm = plt.get_cmap('PiYG')
labelTuples = [
(list(set_of_values - set([raw_feature[0]]))[0], raw_feature[0])]
HEART_BEATER.makeProgress(60)
colorTuples = [(tuple(list(cm(0))[:-1]),
tuple(list(cm(127))[:-1]), tuple(list(cm(255))[:-1]))]
pTimer("Generating counting grid artifacts.")
LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
DOCMAP_FILE_NAME = "/docmap.txt"
bcag = BrowseCloudArtifactGenerator(DIRECTORY_DATA)
bcag.W = [windowSize, windowSize]
bcag.read(LEARNED_GRID_FILE_NAME)
bcag.read_docmap(DOCMAP_FILE_NAME, engine="numpy")
bcag.write_colors(cm=cm, feature_map=feature_map,
stretch_the_truth=True)
bcag.write_legends(labelTuples=labelTuples, colorTuples=colorTuples)
bcag.add_feature_map_to_database(feature_map)
HEART_BEATER.makeProgress(80)
pTimer("Done.")
blob_client.create_container(containerNameOut)
# Upload each generated file to the output container.
FILES = [f.path for f in os.scandir(DIRECTORY_DATA) if not f.is_dir()]
for modelfile in FILES:
upload_file_to_container(blob_client, containerNameOut, modelfile)
HEART_BEATER.makeProgress(100)
except BaseException:
if HEART_BEATER is not None:
HEART_BEATER.done(success=False)
print("Script failed.")
print(traceback.format_exc())
else:
HEART_BEATER.done(success=True)
print("Script succeeded.")

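For reference, a tiny worked example, not part of the commit, of the tab-separated key:value line format in database.txt that dictionary_from_line() above parses; the sample line is illustrative. Note that this parser keeps only the text between the first and second colon, so a value containing ':' would be truncated.

# Same parsing logic as dictionary_from_line() in the script above.
def dictionary_from_line(row):
    prop_dict = {}
    for row_component in row.split('\t'):
        row_key_value = row_component.split(':')
        prop_dict[row_key_value[0]] = row_key_value[1]
    return prop_dict

line = "id:1\ttitle:Counting grids\tlayer:2"
assert dictionary_from_line(line) == {"id": "1", "title": "Counting grids", "layer": "2"}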
View File

@@ -0,0 +1,210 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import hashlib
import json
import datetime
import pandas as pd
import os
import numpy as np
import requests
import re
import traceback
from itertools import islice
from browseCloudServiceAuthorizer import BrowseCloudServiceAuthorizer
from countingGridsHeartBeater import CountingGridsHeartBeater
from jobStatus import JobStatus
from batchJob import BatchJob
import azure.storage.blob as azureblob
from browseCloudAzureUtilities import upload_file_to_container, download_file_from_container
import matplotlib.pyplot as plt
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, PipelineTimer
import sys
sys.path.append("../../..")
if __name__ == "__main__":
HEART_BEATER = None
try:
errStr = "Please give valid command-line arguments."
if len(sys.argv) != 4:
raise ValueError(errStr)
containerNameIn = sys.argv[1] # targetId
containerNameOut = sys.argv[2] # id of current job
windowSize = int(sys.argv[3]) # window size
AUTH_URL = ""
SERVICE_URL = ""
_STORAGE_ACCOUNT_NAME = ""
_STORAGE_ACCOUNT_KEY = ""
jobId = containerNameOut
targetId = containerNameIn
with open("keys.json", "r") as f:
data = json.load(f)
_SENTIMENT_ANALYSIS_KEY = data['_SENTIMENT_ANALYSIS_KEY']
with open('metadata.json', 'r') as fMeta:
dataMeta = json.load(fMeta)
AUTH_URL = dataMeta["AUTH_URL"]
_STORAGE_ACCOUNT_NAME = dataMeta["_STORAGE_ACCOUNT_NAME_MODELS"]
SENTIMENT_ANALYSIS_ENDPOINT = dataMeta["SENTIMENT_ANALYSIS_ENDPOINT"]
if dataMeta['ENV'] == 'DEV':
# TODO: Use Key Vault and a certificate to retrieve this information
# instead of a temporary keys file.
# Note that keys.json is not checked in.
with open("keys.json", "r") as f:
data = json.load(f)
_STORAGE_ACCOUNT_KEY = data['_STORAGE_ACCOUNT_KEY_OUT_DEV']
_STORAGE_ACCOUNT_NAME = data['_STORAGE_ACCOUNT_NAME_OUT_DEV'] if (
'_STORAGE_ACCOUNT_NAME_OUT_DEV' in data
) else _STORAGE_ACCOUNT_NAME
SERVICE_URL = dataMeta["SERVICE_URL_DEV"] + "/api/v1/jobs/" + \
jobId if "SERVICE_URL_DEV" in dataMeta else SERVICE_URL
elif dataMeta['ENV'] == 'PROD':
with open("keys.json", "r") as f:
data = json.load(f)
_STORAGE_ACCOUNT_KEY = data['_STORAGE_ACCOUNT_KEY_OUT_PROD']
_STORAGE_ACCOUNT_NAME = data['_STORAGE_ACCOUNT_NAME_OUT_PROD'] if (
'_STORAGE_ACCOUNT_NAME_OUT_PROD' in data
) else _STORAGE_ACCOUNT_NAME
SERVICE_URL = dataMeta["SERVICE_URL_PROD"] + "/api/v1/jobs/" + \
jobId if "SERVICE_URL_PROD" in dataMeta else SERVICE_URL
else:
raise ValueError(
"Environment type in metadata.json is invalid.")
BATCH_JOB = BatchJob(jobId, JobStatus.NotStarted, 0)
SERVICE_AUTHORIZER = BrowseCloudServiceAuthorizer(AUTH_URL)
HEART_BEATER = CountingGridsHeartBeater(
SERVICE_URL, BATCH_JOB, SERVICE_AUTHORIZER)
# Setup Working Directory.
DIRECTORY_SUFFIX = hashlib.sha3_256(
(targetId+str(datetime.datetime.now())).encode()).hexdigest()
DIRECTORY_DATA = targetId + "_" + DIRECTORY_SUFFIX
if not os.path.isdir(DIRECTORY_DATA):
os.mkdir(DIRECTORY_DATA)
'''
Algorithm:
1. Copy everything over from target container to job container.
2. Get sentiment analysis results for all documents.
3. Weight documents on grid.
4. Output colors file.
5. Write legend file.
'''
pTimer = PipelineTimer()
pTimer("Downloading Target Job Files.")
# Advance the heart beater to the Training state.
HEART_BEATER.next()
HEART_BEATER.next()
blob_client = azureblob.BlockBlobService(
account_name=_STORAGE_ACCOUNT_NAME,
account_key=_STORAGE_ACCOUNT_KEY)
all_blobs = blob_client.list_blobs(containerNameIn)
for blob in all_blobs:
download_file_from_container(
blob_client, containerNameIn, DIRECTORY_DATA + "/"+blob.name, blob.name)
HEART_BEATER.makeProgress(20)
pTimer("Getting sentiment from Azure.")
# Feature Mapping data
# Generate body to send to text analytics.
base_object = {
'documents': []
}
documents = []
def chunk(it, size):
it = iter(it)
return iter(lambda: tuple(islice(it, size)), ())
def dictionary_from_line(row):
prop_dict = {}
row_components = row.split('\t')
for row_component in row_components:
row_key_value = row_component.split(':')
prop_dict[row_key_value[0]] = row_key_value[1]
return prop_dict
with open(DIRECTORY_DATA+"/database.txt", 'r') as f:
for row in f:
prop_dict = dictionary_from_line(row)
documents.append({
'id': prop_dict['id'],
# Text analytics has a 5120 character limit.
'text': (prop_dict['title'] + " " + prop_dict['abstract'])[:5120]
})
# Call text analytics API
requestHeaders = {
'Content-Type': 'application/json',
'Ocp-Apim-Subscription-Key': _SENTIMENT_ANALYSIS_KEY
}
# To stay under the Text Analytics request size limit, send documents in chunks of 500.
feature_map = [None] * len(documents)
for doc_chunk in chunk(documents, 500):
base_object['documents'] = doc_chunk
response = requests.post(
SENTIMENT_ANALYSIS_ENDPOINT, json=base_object, headers=requestHeaders)
# Raise for non 200 response codes
response.raise_for_status()
json_response = response.json()
for element in json_response['documents']:
feature_map[int(element['id']) - 1] = float(element['score'])
HEART_BEATER.makeProgress(60)
# Legend data
cm = plt.get_cmap('coolwarm_r')
labelTuples = [("Negative Sentiment", "Positive Sentiment")]
colorTuples = [(tuple(list(cm(0))[:-1]),
tuple(list(cm(127))[:-1]), tuple(list(cm(255))[:-1]))]
pTimer("Generating counting grid artifacts.")
LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
DOCMAP_FILE_NAME = "/docmap.txt"
bcag = BrowseCloudArtifactGenerator(DIRECTORY_DATA)
bcag.W = [windowSize, windowSize]
bcag.read(LEARNED_GRID_FILE_NAME)
bcag.read_docmap(DOCMAP_FILE_NAME, engine="numpy")
bcag.write_colors(cm=cm, feature_map=feature_map)
bcag.write_legends(labelTuples=labelTuples, colorTuples=colorTuples)
bcag.add_feature_map_to_database(feature_map)
HEART_BEATER.makeProgress(80)
pTimer("Done.")
blob_client.create_container(containerNameOut)
# Upload each generated file to the output container.
FILES = [f.path for f in os.scandir(DIRECTORY_DATA) if not f.is_dir()]
for modelfile in FILES:
upload_file_to_container(blob_client, containerNameOut, modelfile)
HEART_BEATER.makeProgress(100)
except BaseException:
if HEART_BEATER is not None:
HEART_BEATER.done(success=False)
print("Script failed.")
print(traceback.format_exc())
else:
HEART_BEATER.done(success=True)
print("Script succeeded.")

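A quick illustrative check, not part of the commit, of the chunk() helper used above to stay under the Text Analytics request size limit; it yields fixed-size tuples, with a shorter final tuple for the remainder.

from itertools import islice

# Same implementation as chunk() in the script above.
def chunk(it, size):
    it = iter(it)
    return iter(lambda: tuple(islice(it, size)), ())

assert list(chunk(range(7), 3)) == [(0, 1, 2), (3, 4, 5), (6,)]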
View File

@@ -0,0 +1,12 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from enum import IntEnum
class JobStatus(IntEnum):
NotStarted = 0
PreProcessing = 1
Training = 2
Success = 3
Failure = 4

View File

@@ -0,0 +1,4 @@
azure-batch==5.1.1
azure-storage-blob==1.4.0
adal==1.2.0
azure-mgmt-datalake-analytics==0.2.0

292
CountingGridsPy/.gitignore vendored Normal file
View File

@@ -0,0 +1,292 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
src/keys.json
src/metadata.json
src/metadata_other.json
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/packages/*
# except build/, which is used as an MSBuild target.
!**/packages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/packages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Typescript v1 declaration files
typings/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush
.cr/
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs

View File

@@ -0,0 +1,10 @@
from .morphologicalTightener import MorphologicalTightener
from .slidingWindowTrainer import SlidingWindowTrainer
from .browseCloudArtifactGenerator import BrowseCloudArtifactGenerator
from .cgEngineWrapper import CGEngineWrapper
from .nlpCleaner import NLPCleaner
from .pipelineTimer import PipelineTimer
__all__ = ['MorphologicalTightener', 'BrowseCloudArtifactGenerator',
'CGEngineWrapper', 'NLPCleaner', 'PipelineTimer', 'SlidingWindowTrainer']

View File

@@ -0,0 +1,353 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import numpy as np
import scipy.io as io
import pandas as pd
import matplotlib.pyplot as plt
from CountingGridsPy.EngineToBrowseCloudPipeline import MorphologicalTightener
class BrowseCloudArtifactGenerator(object):
def __init__(self, DIRECTORY_DATA):
self.DIRECTORY_DATA = DIRECTORY_DATA
def read(self, LEARNED_GRID_FILE_NAME):
MAT = io.loadmat(self.DIRECTORY_DATA + LEARNED_GRID_FILE_NAME)
self.pi2_idf = MAT['pi2_idf']
self.counts_to_show = MAT['counts_to_show'] # COUNTS MATRIX
if type(self.counts_to_show) is not np.ndarray:
try:
self.counts_to_show = self.counts_to_show.toarray()
except Exception as e:
raise ValueError(e)
# MAPPING AFTER LAYERS CODE Q( Location | Document ) TxE
self.ql2 = MAT['ql2']
# ARRAY WITH THE LAYER NUMBER FOR EACH DOCUMENT, argmax over the layers
self.id_layer = MAT['id_layer'][0]
# LAYERED PI WEIGHTED BY IDF. E1xE2xZxLA
self.pi_la_idf = MAT['pi_la_idf']
try:
# We don't need it for now. Simply 1:T
self.indices_to_show = MAT['indices_to_show'][0]
except Exception as e:
# Not implemented in matlab version
self.indices_to_show = np.array(range(MAT['ql2'].shape[0])) + 1
del MAT
cgsz = np.zeros(2)
cgsz[0], cgsz[1], self.Z = self.pi2_idf.shape
self.cgsz = cgsz.astype(int)
self.indexR = np.arange(0, self.cgsz[0]).astype(int)
self.indexC = np.arange(0, self.cgsz[1]).astype(int)
def write_top_pi(self):
MAXZ = 80
self.pi2_idf = MorphologicalTightener.tighten_pi(self.pi2_idf)
pi_max = np.argsort(-self.pi2_idf, axis=2)[:, :, :MAXZ]
pi_max_vals = -np.sort(-self.pi2_idf, axis=2)[:, :, :MAXZ]
missing_words = set(range(self.Z)).difference(set(pi_max.flatten()))
locations_missing_words = np.zeros([len(missing_words), 4])
for m_id, m in enumerate(missing_words):
loc = np.unravel_index(
np.argmax(self.pi2_idf[:, :, m]), self.cgsz.astype('int'))
locations_missing_words[m_id, :] = [
int(m), self.pi2_idf[loc[0], loc[1], m], loc[0], loc[1]]
with open(self.DIRECTORY_DATA + '/top_pi.txt', 'w') as the_file:
for r in self.indexR:
for c in self.indexC:
tmp = "row:" + ("%1d" % (r+1)) + "\t" + "col:" + ("%1d" % (c+1)) + "\t" + "\t".join(
["%1d" % a + ":" + "%1.3f" % b for a, b in zip(pi_max[r, c, :], pi_max_vals[r, c, :])])
if any((locations_missing_words[:, 2] == r) & (locations_missing_words[:, 3] == c)):
tmp = tmp + "\t" + "\t".join([("%1d" % a) + ":" + ("%1.3f" % b) for a, b in locations_missing_words[(
locations_missing_words[:, 2] == r) & (locations_missing_words[:, 3] == c), :2]])
the_file.write(tmp + "\n")
def write_top_pi_layers(self):
no_layers = self.pi_la_idf.shape[3]
with open(self.DIRECTORY_DATA + '/top_pi_layers.txt', 'w') as the_file:
for layer in range(no_layers):
MAXZ = 80
self.pi_la_idf[:, :, :, layer] = MorphologicalTightener.tighten_pi(
self.pi_la_idf[:, :, :, layer])
pi_max = np.argsort(-self.pi_la_idf[:,
:, :, layer], axis=2)[:, :, :MAXZ]
pi_max_vals = - \
np.sort(-self.pi_la_idf[:, :, :,
layer], axis=2)[:, :, :MAXZ]
missing_words = set(range(self.Z)).difference(
set(pi_max.flatten()))
locations_missing_words = np.zeros([len(missing_words), 4])
for m_id, m in enumerate(missing_words):
loc = np.unravel_index(
np.argmax(self.pi_la_idf[:, :, m, layer]), self.cgsz.astype(int))
locations_missing_words[m_id, :] = [
int(m), self.pi_la_idf[loc[0], loc[1], m, layer], loc[0], loc[1]]
for r in self.indexR:
for c in self.indexC:
tmp = "layer:" + ("%1d" % (layer+1)) + "\t" + "row:" + ("%1d" % (r+1)) + "\t" + "col:" + ("%1d" % (c+1)) + "\t" + "\t".join(
["%1d" % a + ":" + "%1.3f" % b for a, b in zip(pi_max[r, c, :], pi_max_vals[r, c, :])])
if any((locations_missing_words[:, 2] == r) & (locations_missing_words[:, 3] == c)):
tmp = tmp + "\t" + "\t".join([("%1d" % a) + ":" + ("%1.3f" % b) for a, b in locations_missing_words[(
locations_missing_words[:, 2] == r) & (locations_missing_words[:, 3] == c), :2]])
the_file.write(tmp + "\n")
def write_database(self, df, keep):
dfSave = df.copy()
dfSave = dfSave[keep]
dfSave.reset_index(drop=True, inplace=True)
dfSave["id"] = np.arange(len(dfSave)) + 1
dfSave["layer"] = self.id_layer
def format_full_row(row):
row_property_strings = []
for column_name in set(dfSave.columns):
row_property_strings.append(
column_name + ":" + str(row[column_name]))
return str.join('\t', row_property_strings) + '\n'
databaselist = dfSave.apply(format_full_row, axis=1).tolist()
with open(self.DIRECTORY_DATA + '/database.txt', 'w', encoding="utf-8") as the_file:
the_file.writelines(databaselist)
def add_feature_map_to_database(self, feature_map):
file_text = ""
with open(self.DIRECTORY_DATA + '/database.txt', "r") as f:
for index, line in enumerate(f):
file_text += line.strip() + '\tfeature:' + \
str(feature_map[index]) + "\n"
with open(self.DIRECTORY_DATA + '/database.txt', "w") as f:
f.writelines(file_text)
def write_keep(self, keep):
with open(self.DIRECTORY_DATA + '/keep.txt', 'w') as the_file:
the_file.writelines("\n".join([str(int(x)) for x in keep]))
def read_docmap(self, fileName, engine="numpy"):
if not (engine == "numpy" or engine == "matlab"):
raise ValueError("The {} engine does not exist.".format(engine))
self.ql2 = np.zeros(self.ql2.shape)
with open(self.DIRECTORY_DATA + fileName) as f:
for line in f:
arr = line.split("\t")
e1Index = int(arr[0].replace("row:", ""))-1
e2Index = int(arr[1].replace("col:", ""))-1
# since there are layers, the max probability over layers was previously picked for each document
i = 2
while i < len(arr):
docId, qVal, layer = arr[i].split(":")
docId = int(docId)
qVal = float(qVal)
layer = int(layer)
t = docId - 1
if engine == "matlab":
self.ql2[e1Index, e2Index, t] = qVal
elif engine == "numpy":
self.ql2[t, e1Index, e2Index] = qVal
i += 1
def write_docmap(self, wd_size, engine="numpy"):
docToGridMapping = np.copy(self.ql2)
if engine == "matlab":
pass
elif engine == "numpy":
docToGridMapping = np.moveaxis(docToGridMapping, 0, -1)
else:
raise ValueError("The {} engine does not exist.".format(engine))
thr = 0.01
mask = np.zeros(self.cgsz)
mask[:wd_size, :wd_size] = 1
qlSmooth = np.real(np.fft.ifft2(np.fft.fft2(docToGridMapping, axes=(
0, 1)) * np.fft.fft2(np.expand_dims(mask, 2), axes=(0, 1)), axes=(0, 1)))
tmp = list()
with open(self.DIRECTORY_DATA + '/docmap.txt', 'w') as f:
for r in self.indexR:
for c in self.indexC:
ids = np.where(qlSmooth[r, c, :] > thr)[0]
vals = qlSmooth[r, c, ids]
lay = self.id_layer[ids]
tmp.append("row:" + ("%1d" % (r+1)) + "\tcol:" + ("%1d" % (c+1)) + "\t" + "\t".join(
[str(theid + 1)+":"+str(val)+":"+str(l) for theid, val, l in zip(ids, vals, lay)]) + "\n")
f.writelines(tmp)
def write_correspondences(self, correspondences, vocabulary):
'''
Correspondences maps the lemmatized words to the original text.
Example correspondences:
{'adopt': ',adopted,adopted,adopted,adopted', 'work': ',work,work,work,work', 'i': ',i,i,i,i,i,i,i,i
', 'wish': ',wish,wish,wish,wish'}
'''
li = list()
with open(self.DIRECTORY_DATA + '/correspondences.txt', 'w') as the_file:
for k, v in correspondences.items():
unique_values = list(set([w for w in v.split(",") if w != '']))
N = len(unique_values)
li = li + list(zip(unique_values, [k]*N))
tmp = list()
for w1, w2 in li:
try:
i = vocabulary.index(w2)
tmp.append(w1 + "\t" + w2 + "\t" + str(i+1) + "\n")
except Exception as e:
pass
the_file.writelines(tmp)
def write_cooccurences(self):
raise Exception(
"The coccurrences function should not be called because it's not guaranteed to be a correct artifact for BrowseCloud.")
def write_counts(self):
tmp = list()
with open(self.DIRECTORY_DATA + '/words.txt', 'w') as the_file:
for z in range(self.counts_to_show.shape[0]):
docIds = np.where(self.counts_to_show[z, :] != 0)[0]
vals = np.array(self.counts_to_show[z, docIds]).flatten()
tmp.append("id:"+str(z+1) + "\t" + "\t".join(
[str(i + 1) + ":" + "%1d" % v for i, v in zip(docIds, vals)]) + "\n")
the_file.writelines(tmp)
def write_vocabulary(self, vocabulary):
with open(self.DIRECTORY_DATA + '/vocabulary.txt', 'w') as the_file:
the_file.writelines(
[str(id + 1) + "\t" + str(word) + "\n" for id, word in enumerate(vocabulary)])
def write_legends(self, colorTuples=None, labelTuples=None):
if (colorTuples is not None and labelTuples is not None):
with open(self.DIRECTORY_DATA + '/legend.txt', 'w') as the_file:
for ct, lt in zip(colorTuples, labelTuples):
r1, g1, b1 = ct[0]
rm, gm, bm = ct[1]
r2, g2, b2 = ct[2]
l1, l2 = lt
data = [l1, r1, g1, b1, rm, gm, bm, l2, r2, g2, b2]
data = [str(x) for x in data]
the_file.write("\t".join(data)+"\n")
    # 0. Optionally re-normalize ql into a distribution over the grid indices -
    #    by default we do not, because the final map used for visualization would be distorted:
    #    docToGridMapping / np.sum(np.sum(docToGridMapping, axis=0), axis=0)
    # 1. Calculate the weighted average between ql and the feature mapping for each index - the weights are ql.
    # 2. Do 0-1 normalization and multiply by 255 to map the range [0, 1] to the range [0, 255].
    # 3. Use this new range to map onto the RGB color scale.
def mapSentiment(self, docToGridMapping, feature_map, W=[5, 5], doNormalizeQOverGrid=True, stretch_the_truth=False):
normalizedDocToGridMapping = None
if doNormalizeQOverGrid:
normalizedDocToGridMapping = docToGridMapping / \
np.sum(docToGridMapping, axis=(0, 1))
else:
normalizedDocToGridMapping = np.copy(docToGridMapping)
e0, e1, T = docToGridMapping.shape
# toroidal- top, left, and top left
Q = np.pad(normalizedDocToGridMapping, [
(W[0]-1, 0), (W[1]-1, 0), (0, 0)], 'wrap').cumsum(axis=0).cumsum(axis=1)
# sum area table trick
normalizedDocToGridMapping = Q[(
W[0]-1):, (W[1]-1):, :] - Q[(W[0]-1):, :e1, :] - Q[:e0, (W[1]-1):, :] + Q[:e0, :e1, :]
normalizedDocToGridMapping = np.moveaxis(np.moveaxis(
normalizedDocToGridMapping, -1, 0) / np.sum(normalizedDocToGridMapping, axis=-1), 0, -1)
sentimentMapping = np.dot(normalizedDocToGridMapping, feature_map)
weights = None
if stretch_the_truth:
            weights = 255*(sentimentMapping - np.min(sentimentMapping.flatten())) / (np.max(
                sentimentMapping.flatten()) - np.min(sentimentMapping.flatten()))  # weights between 0 and 255
else:
weights = 255*(sentimentMapping)
return (sentimentMapping, weights)
def write_colors(self, colors=None, feature_map=None, engine="numpy", cm=None, stretch_the_truth=False):
def valid_color_comp(c):
            return 0.0 < c < 1
if colors is not None:
for color in colors:
if len(color) != 3 or not valid_color_comp(color[0]) or not valid_color_comp(color[1]) or not valid_color_comp(color[2]):
                    raise Exception(
                        "Invalid RGB color for BrowseCloud input. Each component must be strictly between 0 and 1, and exactly 3 components must be given.")
elif feature_map is not None:
colors = [0 for d in range(len(self.indexR)*len(self.indexC))]
docToGridMapping = np.copy(self.ql2)
if engine == "matlab":
pass
elif engine == "numpy":
# move the first axis to the third
docToGridMapping = np.moveaxis(docToGridMapping, 0, -1)
else:
raise ValueError(
"The {} engine does not exist.".format(engine))
W = None
if self.W is None:
W = [5, 5]
else:
W = self.W.copy()
sentimentMapping, weights = self.mapSentiment(
docToGridMapping, feature_map, W, stretch_the_truth=stretch_the_truth)
if cm is None:
cm = plt.get_cmap('PuRd')
colors = [(c[0], c[1], c[2])
for c in cm([int(np.round(w)) for w in weights.flatten()])]
else:
colors = [(1.0, 1.0, 1.0)
for d in range(len(self.indexR)*len(self.indexC))]
with open(self.DIRECTORY_DATA + '/colors_browser.txt', 'w') as the_file:
for r in self.indexR:
for c in self.indexC:
i = len(self.indexR)*r + c
tmp = ("%1d" % (r+1)) + "\t" + ("%1d" % (c+1)) + "\t" + str(
(colors[i][0])) + "\t"+str((colors[i][1])) + "\t" + str((colors[i][2]))
the_file.write(tmp + "\n")
return colors
if __name__ == "__main__":
bcag = BrowseCloudArtifactGenerator("")
    # 3 documents with sentiments [0, .5, 1], from negative to positive
    # 3x3 grid
doc1Q = [
[0.25, 0.25, 0],
[0.25, 0.25, 0],
[0, 0, 0]
]
doc2Q = [
[0, 0.25, 0.25],
[0, 0.25, 0.25],
[0, 0, 0]
]
doc3Q = [
[0, 0, 0],
[0, 0.25, 0.25],
[0, 0.25, 0.25]
]
q = np.array([doc1Q, doc2Q, doc3Q])
feature_map = np.array([0, 0.5, 1])
W = [1, 1]
qMatlab = np.moveaxis(q, 0, -1)
Q = np.pad(q, [(0, 0), (W[1]-1, 0), (W[0]-1, 0)],
'wrap').cumsum(1).cumsum(2)
normalizedDocToGridMapping = Q[:, (W[0]-1):, (W[1]-1):] - \
Q[:, (W[0]-1):, :3] - Q[:, :3, (W[1]-1):] + Q[:, :3, :3]
print(normalizedDocToGridMapping)
result = bcag.mapSentiment(qMatlab, feature_map)[0]
print('DONE')
print(result)
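    # --- Illustrative sketch (an assumption, not part of the original pipeline code):
    # mapSentiment aggregates the per-document grid posterior over a wrapped window using
    # a summed-area table (integral image) built from cumulative sums. The self-contained
    # check below demonstrates the general technique on random data, comparing the
    # cumulative-sum window sum against a naive quadruple loop; names here are illustrative only.
    w0, w1 = 3, 3
    qDemo = np.random.rand(6, 6, 2)
    e0, e1 = qDemo.shape[:2]
    padded = np.pad(qDemo, [(0, w0 - 1), (0, w1 - 1), (0, 0)], 'wrap')
    S = np.pad(padded.cumsum(axis=0).cumsum(axis=1),
               [(1, 0), (1, 0), (0, 0)], 'constant')
    fast = S[w0:, w1:, :] - S[:-w0, w1:, :] - S[w0:, :-w1, :] + S[:-w0, :-w1, :]
    naive = np.zeros_like(qDemo)
    for r in range(e0):
        for c in range(e1):
            for dr in range(w0):
                for dc in range(w1):
                    naive[r, c, :] += qDemo[(r + dr) % e0, (c + dc) % e1, :]
    assert np.allclose(fast, naive)
    print("Toroidal summed-area-table window sums match the naive loop.")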


@ -0,0 +1,135 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from CountingGridsPy.models import CountingGridModel
from CountingGridsPy.EngineToBrowseCloudPipeline import SlidingWindowTrainer
from scipy import io
import pandas as pd
import numpy as np
import scipy as sp
import os
from sklearn.feature_extraction.text import CountVectorizer
class CGEngineWrapper(object):
def __init__(self, extent_size=32, window_size=5, layers=2, heartBeaters=None):
self.cg_size = extent_size
self.wd_size = window_size
self.no_layers = layers
self.heartBeaters = heartBeaters
def ready_the_matlab_engine(self, X, labels_filtered, DIRECTORY_DATA):
m, n = X.shape
s = X.data
i = X.tocoo().row
j = X.indices
io.savemat(DIRECTORY_DATA + '/counts.mat',
{'m': m, 'n': n, 's': s, 'i': i, 'j': j})
if labels_filtered is not None:
io.savemat(DIRECTORY_DATA + '/labels.mat',
{'labels': labels_filtered})
def __fitCountVectorizor(self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, labels, MIN_FREQUENCY):
df = pd.read_table(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME)
df = df[df.columns[1:]]
TEXT = df['pos_filtered'].tolist()
id_grid = np.array(
[i for i, t in enumerate(TEXT) if len(str(t).split(" ")) > 3]
)
addl_keep = np.zeros(len(TEXT))
addl_keep[id_grid] += 1
addl_keep = addl_keep.astype(bool)
        df = df.iloc[id_grid].reset_index(drop=True)
self.labelsS = np.array(
labels)[id_grid] if labels is not None else None
vect = CountVectorizer(decode_error="ignore", min_df=MIN_FREQUENCY)
X = vect.fit_transform(df['pos_filtered'].tolist())
return (vect, X, addl_keep)
def get_vocab(self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, labels, MIN_FREQUENCY, keep):
vect, _, addl_keep = self.__fitCountVectorizor(
DIRECTORY_DATA,
CLEAN_DATA_FILE_NAME,
labels,
MIN_FREQUENCY
)
return (vect.get_feature_names(), np.array(keep) & np.array(addl_keep))
def incremental_fit(
self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, labels,
MIN_FREQUENCY, keep, engine, initial_max_iter=100,
w=2000, s=1000, runInitialTrain=True
):
vect, X, addl_keep = self.__fitCountVectorizor(
DIRECTORY_DATA,
CLEAN_DATA_FILE_NAME,
labels,
MIN_FREQUENCY
)
if engine != "numpy":
raise ValueError("Not implemented!")
extent = np.array([self.cg_size, self.cg_size])
window = np.array([self.wd_size, self.wd_size])
model = CountingGridModel(extent, window)
T = X.shape[0]
training_max_iter_vec = [1 for x in range(int(np.ceil((T-w)/s)) + 1)]
train = model.fit
kwargs = {
"data": X.toarray(),
"max_iter": initial_max_iter,
"returnSumSquareDifferencesOfPi": False,
"layers": self.no_layers,
"noise": .00001,
"output_directory": DIRECTORY_DATA
}
print("Iteration vectors:")
print("Running for this number of iteration:" +
str(len([initial_max_iter] + training_max_iter_vec)))
print([initial_max_iter] + training_max_iter_vec)
SlidingWindowTrainer.SlidingTrainer(
w,
s,
training_max_iter_vec,
train,
kwargs,
runInitialTrain=runInitialTrain
)
return (vect.get_feature_names(), np.array(keep) & np.array(addl_keep))
def fit(self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, labels, MIN_FREQUENCY, keep, engine):
vect, X, addl_keep = self.__fitCountVectorizor(
DIRECTORY_DATA,
CLEAN_DATA_FILE_NAME,
labels,
MIN_FREQUENCY
)
if engine == "matlab":
self.ready_the_matlab_engine(X, labels, DIRECTORY_DATA)
os.system(r"cg_exe_buja.exe {0} counts.mat ".format(
DIRECTORY_DATA) + str(self.cg_size) + " " + str(self.wd_size) + " " + str(self.no_layers))
elif engine == "numpy":
extent = np.array([self.cg_size, self.cg_size])
window = np.array([self.wd_size, self.wd_size])
model = CountingGridModel(extent, window)
model.fit(
X.toarray(),
max_iter=100,
returnSumSquareDifferencesOfPi=False,
layers=self.no_layers,
noise=.00000001,
output_directory=DIRECTORY_DATA,
heartBeaters=self.heartBeaters
)
elif engine == "torch":
raise ValueError("Not implemented yet.")
else:
raise ValueError("The {} engine does not exist.".format(engine))
return (vect.get_feature_names(), np.array(keep) & np.array(addl_keep))
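# --- Illustrative sketch (an assumption, not part of the original file): the
# (m, n, s, i, j) fields saved by ready_the_matlab_engine are a COO-style triplet
# view of the sparse counts matrix. The round-trip below extracts the triplets the
# same way and checks that they rebuild the original matrix; names are illustrative.
if __name__ == "__main__":
    from scipy.sparse import csr_matrix, coo_matrix
    X_demo = csr_matrix(np.array([[0, 2, 0],
                                  [1, 0, 3]]))
    s_demo, i_demo, j_demo = X_demo.data, X_demo.tocoo().row, X_demo.indices
    rebuilt = coo_matrix((s_demo, (i_demo, j_demo)), shape=X_demo.shape).toarray()
    assert (rebuilt == X_demo.toarray()).all()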


@ -0,0 +1,102 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, CGEngineWrapper, NLPCleaner, PipelineTimer
import sys
import numpy as np
import os
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from CountingGridsPy.models import CountingGridModel
pTimer = PipelineTimer()
# Example CLI: python dumpCountingGrids.py CountingGridInput_MarchSurvey 24 5 matlabEngine metadataInput channelDump.csv
errStr = '''
Please give valid command-line arguments.
Instructions found here: https://github.com/microsoft/browsecloud/wiki/Data-Pipeline-Documentation.
'''
# ---------------------------------------------------------------------------------------
# Input
# ---------------------------------------------------------------------------------------
if len(sys.argv) != 7:
    print(sys.argv)
raise ValueError(errStr)
DIRECTORY_DATA = sys.argv[1]
EXTENT_SIZE = int(sys.argv[2])
WINDOW_SIZE = int(sys.argv[3])
engine_type = sys.argv[4]
inputfile_type = sys.argv[5]
inputfile_name = sys.argv[6]
if engine_type != "numpyEngine" and engine_type != "matlabEngine":
raise ValueError(
"The {0} engine does not exist. Please use 'matlabEngine' or 'numpyEngine'.".format(engine_type))
engine_type = engine_type[:-6] # 6 characters in the word "Engine"
if inputfile_type != "metadataInput" and inputfile_type != "simpleInput":
raise ValueError(
"The {0} input type does not exist. Please use 'simpleInput' or 'metadataInput'.".format(inputfile_type))
inputfile_type = inputfile_type[:-5] # remove "Input"
if not os.path.isdir(DIRECTORY_DATA):
raise ValueError(
"Undefined local directory where digital channel is dumped!\n" + errStr)
FILE_NAME = DIRECTORY_DATA + "\\" + inputfile_name
CLEAN_DATA_FILE_NAME, MIN_FREQUENCY, MIN_WORDS = ["\\cg-processed.tsv", 2, 5]
# ---------------------------------------------------------------------------------------
# Data Cleaning
# ---------------------------------------------------------------------------------------
cleaner = NLPCleaner()
correspondences = None
CACHED_CORRESPONDENCES_FILE_NAME = "\\cached_correspondences.tsv"
pTimer("Reading data file.")
df, keep = cleaner.read(FILE_NAME, inputfile_type, MIN_FREQUENCY, MIN_WORDS)
if not (os.path.exists(DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME) and os.path.exists(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME)):
pTimer("Starting data cleaning.")
cleaner.handle_negation_tokens()
cleaner.removePunctuation()
correspondences = cleaner.lemmatize()
cleaner.write_cached_correspondences(
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
cleaner.write(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME)
else:
pTimer("Skipping data cleaning.")
correspondences = cleaner.read_cached_correspondences(
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
pTimer("Learning counting grid.")
LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
engine = CGEngineWrapper(extent_size=EXTENT_SIZE, window_size=WINDOW_SIZE)
vocabulary = None
# ---------------------------------------------------------------------------------------
# Learning
# ---------------------------------------------------------------------------------------
if not os.path.exists(DIRECTORY_DATA + LEARNED_GRID_FILE_NAME):
vocabulary, keep = engine.fit(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME,
cleaner.labelsS, MIN_FREQUENCY, keep, engine=engine_type)
else:
vocabulary, keep = engine.get_vocab(
DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS, MIN_FREQUENCY, keep)
# ---------------------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------------------
pTimer("Generating counting grid artifacts.")
LINK_FILE_NAME = ""
bcag = BrowseCloudArtifactGenerator(DIRECTORY_DATA)
bcag.read(LEARNED_GRID_FILE_NAME)
bcag.write_docmap(engine.wd_size, engine=engine_type)
bcag.write_counts()
bcag.write_vocabulary(vocabulary)
bcag.write_top_pi()
bcag.write_top_pi_layers()
bcag.write_colors() # write default blue colors
bcag.write_database(df, keep)
bcag.write_correspondences(correspondences, vocabulary)
bcag.write_keep(keep)
pTimer("Done.")


@ -0,0 +1,111 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from CountingGridsPy.EngineToBrowseCloudPipeline import BrowseCloudArtifactGenerator, CGEngineWrapper, NLPCleaner, PipelineTimer
import sys
import numpy as np
import os
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from CountingGridsPy.models import CountingGridModel
pTimer = PipelineTimer()
# Example CLI: python dumpSlidingWindowCountingGrids.py CalculatorDataExperiment6 24 5 numpyEngine simpleTimeInput CalculatorUIFData.txt
errStr = '''
Please give valid command-line arguments.
Instructions found here: https://github.com/microsoft/browsecloud/wiki/Data-Pipeline-Documentation.
'''
# ---------------------------------------------------------------------------------------
# Input
# ---------------------------------------------------------------------------------------
if len(sys.argv) != 7:
    print(sys.argv)
raise ValueError(errStr)
DIRECTORY_DATA = sys.argv[1]
EXTENT_SIZE = int(sys.argv[2])
WINDOW_SIZE = int(sys.argv[3])
engine_type = sys.argv[4]
inputfile_type = sys.argv[5]
inputfile_name = sys.argv[6]
if engine_type != "numpyEngine":
raise ValueError("The {0} engine does not exist.".format(engine_type))
engine_type = engine_type[:-6] # 6 characters in the word "Engine"
if inputfile_type != "simpleTimeInput":
raise ValueError(
"The {0} input type does not exist.".format(inputfile_type))
inputfile_type = inputfile_type[:-5] # remove "Input"
if not os.path.isdir(DIRECTORY_DATA):
raise ValueError(
"Undefined local directory where digital channel is dumped!\n" + errStr)
FILE_NAME = DIRECTORY_DATA + "\\" + inputfile_name
CLEAN_DATA_FILE_NAME, MIN_FREQUENCY, MIN_WORDS = ["\\cg-processed.tsv", 2, 5]
# ---------------------------------------------------------------------------------------
# Data Cleaning
# ---------------------------------------------------------------------------------------
cleaner = NLPCleaner()
correspondences = None
CACHED_CORRESPONDENCES_FILE_NAME = "\\cached_correspondences.tsv"
pTimer("Reading data file.")
df, keep = cleaner.read(FILE_NAME, inputfile_type, MIN_FREQUENCY, MIN_WORDS)
if not (os.path.exists(DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME) and os.path.exists(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME)):
pTimer("Starting data cleaning.")
cleaner.handle_negation_tokens()
cleaner.removePunctuation()
correspondences = cleaner.lemmatize()
cleaner.write_cached_correspondences(
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
cleaner.write(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME)
else:
pTimer("Skipping data cleaning.")
correspondences = cleaner.read_cached_correspondences(
DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME)
pTimer("Learning counting grid.")
LEARNED_GRID_FILE_NAME = "/CountingGridDataMatrices.mat"
engine = CGEngineWrapper(extent_size=EXTENT_SIZE, window_size=WINDOW_SIZE)
vocabulary = None
dataFolders = []  # initialized up front so the finally block never references an unbound name
try:
# ---------------------------------------------------------------------------------------
# Learning
# ---------------------------------------------------------------------------------------
if not os.path.exists(DIRECTORY_DATA + LEARNED_GRID_FILE_NAME):
vocabulary, keep = engine.incremental_fit(DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS,
MIN_FREQUENCY, keep, engine=engine_type, initial_max_iter=100, w=2000, s=1, runInitialTrain=True)
else:
vocabulary, keep = engine.get_vocab(
DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, cleaner.labelsS, MIN_FREQUENCY, keep)
pTimer("Generating counting grid artifacts.")
dataFolders = [DIRECTORY_DATA] + \
[f.path for f in os.scandir(DIRECTORY_DATA) if f.is_dir()]
    # we know a priori it should be [DIRECTORY_DATA, DIRECTORY_DATA + "/iter0", DIRECTORY_DATA + "/iter1", ...]
dataFolders.sort()
except Exception as e:
print(e)
finally:
# ---------------------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------------------
for i, folder in enumerate(dataFolders):
LINK_FILE_NAME = ""
bcag = BrowseCloudArtifactGenerator(folder)
if os.path.exists(folder + LEARNED_GRID_FILE_NAME):
bcag.read(LEARNED_GRID_FILE_NAME)
bcag.write_docmap(engine.wd_size, engine=engine_type)
bcag.write_counts()
bcag.write_vocabulary(vocabulary)
bcag.write_top_pi()
bcag.write_top_pi_layers()
bcag.write_colors() # write default blue colors
bcag.write_correspondences(correspondences, vocabulary)
if i == 0:
bcag.write_database(df, keep)
bcag.write_keep(keep)
pTimer("Done.")


@ -0,0 +1,102 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from skimage.measure import label, regionprops
import numpy as np
class MorphologicalTightener():
@staticmethod
def tighten_pi(pi):
'''
Algorithm:
1. Go through each word in the vocabulary stored in pi params.
2. Find all connected components within the
the 2-d plane for the given feature, which is like an image.
3. Within each component find the index with highest probability.
4. Set the probability of all other elements
in the component set to be 0.
'''
pi = np.copy(pi)
Z = pi.shape[2]
for z in range(Z):
tmp_pi = pi[:, :, z]
            labelled_components = label(
                tmp_pi > 1e-3,
                background=None,
                return_num=False,
                connectivity=1
            )
for region in regionprops(labelled_components):
x_coords = region.coords[:, 0]
y_coords = region.coords[:, 1]
vals = tmp_pi[x_coords, y_coords]
i_max = np.argmax(vals)
                max_r, max_c = region.coords[i_max]
                maxval = tmp_pi[max_r, max_c]
                tmp_pi[x_coords, y_coords] = 0
                tmp_pi[max_r, max_c] = maxval
pi[:, :, z] = tmp_pi
return pi
@staticmethod
def tighten_q():
pass
@staticmethod
def print_results(input, output):
assert(np.all(input.shape == output.shape))
for z in range(input.shape[2]):
print("Feature index " + str(z) + ":")
print("Input:")
print(input[:, :, z])
print("Output:")
print(output[:, :, z])
print("")
if __name__ == "__main__":
# test1
INPUT = np.array(
[
[
[0.5, 0.5, 0, 0],
[0.7, 0, 0, 0.5],
[0.5, 0, 0, 0.6],
[0.5, 0, 0.5, 0]
],
[
[0.004, 0.5, 0, 0],
[0.5, 0, 0, 0],
[100, 0, 0, 0],
[0, 0, 0, 0.5]
]
]).astype(float).transpose((1, 2, 0))
OUTPUT = np.array(
[
[
[0, 0, 0, 0],
[0.7, 0, 0, 0],
[0, 0, 0, 0.6],
[0, 0, 0.5, 0]
],
[
[0, 0, 0, 0],
[0, 0, 0, 0],
[100, 0, 0, 0],
[0, 0, 0, 0.5]
]
]
).astype(float).transpose((1, 2, 0))
assert(np.all(MorphologicalTightener.tighten_pi(INPUT) == OUTPUT))
MorphologicalTightener.print_results(
INPUT,
MorphologicalTightener.tighten_pi(INPUT)
)
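    # --- Illustrative sketch (an assumption, not part of the original file): how
    # skimage's label/regionprops expose the connected components that tighten_pi
    # walks over. Two separate blobs yield two regions, each listing its pixel coords.
    demo = np.array([
        [1, 1, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 1]
    ])
    for region in regionprops(label(demo > 0, connectivity=1)):
        print("component coordinates:", region.coords.tolist())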


@ -0,0 +1,207 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import re
import functools
import string
from collections import defaultdict
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tag.stanford import StanfordPOSTagger
import pandas as pd
import os
import numpy as np
from CountingGridsPy.EngineToBrowseCloudPipeline.surveyData import SurveyData
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import stop_words
def tokenizer(x, wordnet_lemmatizer, corrispondences, tagger):
try:
def get_wordnet_pos(treebank_tag):
if treebank_tag.startswith('J'):
return wordnet.ADJ
elif treebank_tag.startswith('V'):
return wordnet.VERB
elif treebank_tag.startswith('N') or treebank_tag.startswith('FW'):
return wordnet.NOUN
elif treebank_tag.startswith('R'):
return wordnet.ADV
else:
return wordnet.NOUN
# tagged = tagger.tag(x.split()) #e.g. [('what', 'WP')]
tagged = [(d, 'VB') if d not in stop_words.ENGLISH_STOP_WORDS else (
d, '') for d in x.split()] # stopword removal here
tagged = [(wordnet_lemmatizer.lemmatize(re.sub("[^\w\d]", "", t[0]), get_wordnet_pos(t[1])), t[1], t[0])
for t in tagged] # the regular expression removes characters that aren't digits or alphanumeric characters
# tagged - (lemmatized word, part of speech, original word) e.g. [('what', 'WP', 'what')
for t in tagged:
corrispondences[t[0]] = corrispondences.get(t[0], "") + "," + t[2]
return [t[0] for t in tagged], [t[1] for t in tagged]
except Exception as e:
print(e)
return [], []
class NLPCleaner(object):
def __init__(self):
self.corrispondences = dict()
self.textS = None
self.labelsS = None
def initial_clean_after_read(self, df, MIN_WORDS):
self.text = [(
str(aTuple[0]) + " " + str(aTuple[1]) + " " + str(aTuple[1]) + " " + str(aTuple[1])
) if not pd.isnull(aTuple[0]) and not pd.isnull(aTuple[1]) else (
str(aTuple[1]) if not pd.isnull(aTuple[1]) else (
str(aTuple[0]) if not pd.isnull(aTuple[0]) else ""
)
) for aTuple in zip(df.title.tolist(), df.abstract.tolist())]
text_extremely_cleaned = [a for a in map(lambda x: re.sub("\\b[A-Za-z]{1,3}\\b", " ", re.sub(
" +", " ", re.sub("[^A-Za-z\\s]", " ", re.sub("\[[^\]]*\]", " ", x)))), self.text)]
keep = np.array([True if len(doc.split()) >=
MIN_WORDS else False for doc in text_extremely_cleaned])
self.textS = [re.sub(" +", " ", re.sub("\|", " ", re.sub("\[[^\]]*\]",
" ", t.lower()))) if k else "" for k, t in zip(keep, self.text)]
return keep
def read(self, FILE_NAME, inputfile_type, MIN_FREQUENCY=2, MIN_WORDS=5):
df = None
if inputfile_type == "metadata":
df = pd.read_csv(FILE_NAME, keep_default_na=False)
if 'title' not in df:
df['title'] = ''
if 'abstract' not in df:
df['abstract'] = ''
elif inputfile_type == "simple":
df = pd.read_csv(FILE_NAME, sep="\t", header=None,
keep_default_na=False)
df.columns = ["title", "abstract", "link"]
elif inputfile_type == "simpleTime":
df = pd.read_csv(FILE_NAME, sep="\t", header=None,
keep_default_na=False)
df.columns = ["title", "abstract", "link", "time"]
else:
raise ValueError(
"Input_type " + str(inputfile_type) + " is not valid.")
# Validation now happens in BrowseCloud.Service. Assuming correct input, so no validation needed.
keep = self.initial_clean_after_read(df, MIN_WORDS)
'''
le = LabelEncoder()
labels = le.fit_transform(df.Corpus.tolist())
self.labelsS = [t for k,t in zip(keep,labels) if k] #not used
'''
return (df, np.copy(keep))
def handle_negation_tokens(self):
negation_handling_functions = [lambda x: re.sub(" +", " ", x),
lambda x: re.sub(
"(^|\W|\.)can[\W']*[o]*(t|not)(\W|$|\.)", " can not ", x),
lambda x: re.sub(
"(^|\W|\.)could[n]*[\W']*[no]*(t|not)(\W|$|\.)", " could not ", x),
lambda x: re.sub(
"(^|\W|\.)should[n]*[\W']*[no]*(t|not)(\W|$|\.)", " should not ", x),
lambda x: re.sub(
"(^|\W|\.)would[n]*[\W']*[no]*(t|not)(\W|$|\.)", " would not ", x),
lambda x: re.sub(
"(^|\W|\.)won[\W']*(t)(\W|$|\.)", " wont ", x),
lambda x: re.sub(
"(^|\W|\.)do[n]*[\W']*[no]*t(\W|$|\.)", " do not ", x),
lambda x: re.sub(
"(^|\W|\.)does[n]*[\W']*[no]*t(\W|$|\.)", " does not ", x),
lambda x: re.sub(
"(^|\W|\.)did[n]*[\W']*[no]*t(\W|$|\.)", " did not ", x),
lambda x: re.sub(
"(^|\W|\.)is[n]*[\W']*[no]*t(\W|$|\.)", " is not ", x),
lambda x: re.sub(
"(^|\W|\.)are[n]*[\W']*[no]*t(\W|$|\.)", " are not ", x),
lambda x: re.sub(
"(^|\W|\.)was[n]*[\W']*[no]*t(\W|$|\.)", " was not ", x),
lambda x: re.sub(
"(^|\W|\.)were[n]*[\W']*[no]*t(\W|$|\.)", " were not ", x),
lambda x: re.sub(
"(^|\W|\.)ain[\W']*[no]*t(\W|$|\.)", " aint ", x),
lambda x: re.sub(
"(^|\W|\.)had[n]*[\W']*[no]*(t|not)(\W|$|\.)", " had not ", x),
lambda x: re.sub(
"(^|\W|\.)has[n]*[\W']*[no]*(t|not)(\W|$|\.)", " has not ", x),
lambda x: re.sub(
"(^|\W|\.)have[n]*[\W']*[no]*(t|not)(\W|$|\.)", " have not ", x),
lambda x: x.lower()]
def compose(*functions):
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
normalize_negations = compose(*negation_handling_functions)
self.textS = [a for a in map(
lambda x: normalize_negations(x), self.textS)]
def removePunctuation(self):
regexPunkt = re.compile('[%s]' % re.escape(string.punctuation))
self.textSb = [a for a in map(lambda x: re.sub(
" +", " ", regexPunkt.sub(' ', x)), self.textS)]
def lemmatize(self):
path_to_model = "./stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger"
path_to_jar = "./stanford-postagger-full-2017-06-09/stanford-postagger.jar"
        # Keep the constructor call here as a comment, just in case. We removed the Stanford tagger call from the tokenizer's part-of-speech tagging.
        # StanfordPOSTagger(path_to_model, path_to_jar); tagger.java_options='-mx10G'
tagger = None
wordnet_lemmatizer = WordNetLemmatizer()
self.cleaned_featurized = [a for a in map(lambda x: tokenizer(
x, wordnet_lemmatizer, self.corrispondences, tagger) if x != "" else ([], []), self.textSb)]
return self.corrispondences
def write(self, DIRECTORY_DATA, CLEAN_DATA_FILE_NAME, start=None, finish=None):
assert(len(self.text) == len(self.cleaned_featurized))
if start is None or finish is None:
start = 0
finish = len(self.text)
df = pd.DataFrame(
columns=['original', 'cleaned', 'pos', 'pos_filtered'])
df['original'] = [w for w in map(
lambda x: x.replace("\t", " "), self.text[start:finish])]
try: # if correspondences runs
df['cleaned'] = [" ".join(d[0]).replace("\t", " ")
for d in self.cleaned_featurized[start:finish]]
df['pos'] = [" ".join(d[1]).replace("\t", " ")
for d in self.cleaned_featurized[start:finish]]
df['pos_filtered'] = [" ".join([w for (w, t) in zip(d[0], d[1]) if t in (
['JJR', 'JJS', 'JJ', 'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WRB', 'FW']
)]) for d in self.cleaned_featurized[start:finish]]
except Exception as e:
df['cleaned'] = ["" for w in self.text[start:finish]]
df['pos'] = ["" for w in self.text[start:finish]]
df['pos_filtered'] = ["" for w in self.text[start:finish]]
if not os.path.isdir(DIRECTORY_DATA):
os.mkdir(DIRECTORY_DATA)
df.to_csv(DIRECTORY_DATA + CLEAN_DATA_FILE_NAME,
sep="\t", encoding="utf-8")
def write_cached_correspondences(self, DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME):
df = pd.DataFrame(columns=['lemma', 'words'])
df['lemma'] = [lemma for lemma in self.corrispondences]
df['words'] = [self.corrispondences[lemma]
for lemma in self.corrispondences]
df.to_csv(DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME,
sep="\t", encoding="utf-8")
def read_cached_correspondences(self, DIRECTORY_DATA, CACHED_CORRESPONDENCES_FILE_NAME):
df = pd.read_csv(
DIRECTORY_DATA + CACHED_CORRESPONDENCES_FILE_NAME, sep="\t", encoding="utf-8")
correspondences = dict()
for key, val in zip(df['lemma'], df['words']):
correspondences[key] = val
return correspondences
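# --- Illustrative sketch (an assumption, not part of the original file): the compose()
# helper inside handle_negation_tokens applies the *last* function in the list first,
# which is why the lowercasing lambda sits at the end of negation_handling_functions.
# A tiny reproduction of that right-to-left ordering:
if __name__ == "__main__":
    def compose(*functions):
        return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
    exclaim_after_upper = compose(lambda x: x + "!", str.upper)
    print(exclaim_after_upper("can not"))  # str.upper runs first, then "!" is appended: "CAN NOT!"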


@ -0,0 +1,46 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import datetime
import time
class PipelineTimer():
def __init__(self):
self.last_s = time.time()
self.init_time = self.last_s
def __call__(self, description):
curr_s = time.time()
self.printSectionDescription(description)
self.printCurrentTime()
self.printPrettyTime(curr_s - self.last_s)
self.last_s = curr_s
if description == "Done.":
self.prettyPrintTotal(curr_s - self.init_time)
def printSectionDescription(self, description):
print(description)
def prettyPrintHelper(self, s: float):
hours = s // 3600
s -= (hours * 3600)
minutes = s//60
s -= (minutes * 60)
seconds = s
return [hours, minutes, seconds]
def printPrettyTime(self, s: float):
hours, minutes, seconds = self.prettyPrintHelper(s)
string = 'Previous section took {}h {}m {}s.'.format(
int(hours), int(minutes), float(int(seconds*100))/100)
print(string)
def printCurrentTime(self):
print(datetime.datetime.now())
def prettyPrintTotal(self, s: float):
hours, minutes, seconds = self.prettyPrintHelper(s)
string = 'Entire program took {}h {}m {}s.'.format(
int(hours), int(minutes), float(int(seconds*100))/100)
print(string)
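# --- Illustrative usage sketch (an assumption, not part of the original file): calling
# the timer between pipeline stages prints the stage description, the current wall-clock
# time, and the time elapsed since the previous call; the literal description "Done."
# additionally prints the total running time.
if __name__ == "__main__":
    timer = PipelineTimer()
    time.sleep(0.2)
    timer("Finished the first stage.")
    timer("Done.")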


@ -0,0 +1,44 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import numpy as np
import os
class SlidingWindowTrainer():
@staticmethod
def SlidingTrainer(w: int, s: int, training_max_iter_vec: list, train, kwargs: dict, runInitialTrain=True):
        '''
        Assumes: kwargs["data"] is sorted by date.
        w is the sliding-window width (documents per training pass) and s is the stride between passes.
        '''
print("Learning Initial Grid.")
pi = None
if runInitialTrain:
pi = train(**kwargs)
assert("data" in kwargs and "max_iter" in kwargs and "output_directory" in kwargs)
data = kwargs['data']
root_dir = kwargs["output_directory"].replace(
".", "").replace("/", "").replace("\\", "")
T = len(data)
assert(w < T)
max_sliding_window_size = int(np.ceil((T-w)/s) + 1)
assert(len(training_max_iter_vec) == max_sliding_window_size)
first_index = 0
last_index = min(w, T)
for i in range(max_sliding_window_size):
kwargs['max_iter'] = training_max_iter_vec[i]
kwargs['data'] = data[first_index:last_index]
kwargs['output_directory'] = "./" + root_dir + "/iter" + str(i)
kwargs['pi'] = pi
if not (os.path.exists(kwargs['output_directory']) and os.path.isdir(kwargs['output_directory'])):
os.mkdir(kwargs['output_directory'])
print("Learning grid window from first index: " +
str(first_index) + " to second index: " + str(last_index))
pi = train(**kwargs)
last_index = min(last_index + s, T)
first_index = min(first_index + s, T)
assert(last_index >= T)
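# --- Illustrative sketch (an assumption, not part of the original file): the window
# bookkeeping used above. With T documents, window width w and stride s there are
# ceil((T - w) / s) + 1 training passes; the toy numbers below print the
# [first_index, last_index) slice of documents each pass would train on.
if __name__ == "__main__":
    T, w, s = 10, 4, 3
    first_index, last_index = 0, min(w, T)
    for i in range(int(np.ceil((T - w) / s)) + 1):
        print("pass", i, "covers documents", first_index, "to", last_index)
        last_index = min(last_index + s, T)
        first_index = min(first_index + s, T)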


@ -0,0 +1,16 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
class SurveyData(object):
def __init__(self, alias, title, abstract, responseId, surveyId, link, image):
self.alias = alias
self.title = title
self.abstract = abstract
self.responseId = responseId
self.surveyId = surveyId
self.link = link
self.image = image
def toCols(self):
return [colname for colname in self.__dict__]


@ -0,0 +1,441 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import math
import os
import numpy as np
import scipy.io
import scipy.stats
np.random.seed(0)
# Two functions for use by users of the library:
# 1.
# fit(data, ...) - Fits the counting grid model to the data set. If layers > 1,
# a hierarchical, layered counting grid is learned.
# 2.
# predict_probabilities(bagofwordcounts, k) - computes the likelihood,
# i.e. the probability of the data given the parameters and the choice of window.
class CountingGridModel():
def __init__(self, extent, window):
"""
extent is a 1-D array of size D in the paper.
D is often 2, since it makes the model easily visualizable.
        window is a one-dimensional array of size D in the paper.
"""
# E in the paper
self.extent = extent
# W in the paper
self.window = window
self.D = len(extent)
# Each word is some value between 1 & the size of vocabulary
self.vocabulary = np.array([])
self.extent_volume = 1
self.window_volume = 1
for v in self.window:
self.window_volume *= v
for v in self.extent:
self.extent_volume *= v
self.capacity = self.extent_volume / self.window_volume
# Assumes: self.pi, self.q,self.extent are set properly
def cg_layers(self, data, L, noise=1e-10):
T, Z = data.shape
pi_la = np.zeros([self.extent[0], self.extent[1], Z, L])
h_la = np.zeros([self.extent[0], self.extent[1], Z, L])
# Modifies: h_la
def layer_compute_h(pi_la, h_la):
h = self.compute_h(pi_la[:, :, :, l], self.window)
h_la[:, :, :, l] = np.transpose(
np.transpose(h) / np.transpose(np.sum(h, axis=2))
)
# Add noise to pi
for l in range(L):
pi_la[:, :, :, l] = (
self.pi + np.random.uniform(size=self.pi.shape)*noise
)
# Normalize
pi_la[:, :, :, l] = np.transpose(
np.transpose(pi_la[:, :, :, l]) /
np.transpose(np.sum(pi_la[:, :, :, l], axis=2))
)
layer_compute_h(pi_la, h_la)
P = np.prod(self.extent)
# lq/lql is the log of q
# lq/lql is shaped as ([P,T])
        # self.q is stored as T x E1 x E2, which is why we move the time axis to the end
        # before reshaping to (P, T), and move it back after normalizing.
        def normalize_q(lql):
            lqlmax = np.amax(lql, axis=0)
precompute_lql_minus_max = lql - lqlmax
# qla has the same formula, essentially, but not in the log form
Lq = np.reshape(
precompute_lql_minus_max -
np.log(np.sum(np.exp(precompute_lql_minus_max), axis=0)),
[self.extent[0], self.extent[1], T]
)
return np.moveaxis(np.exp(Lq), 2, 0)
def update_summand(self, toroidal_F, M, N):
A = toroidal_F.cumsum(axis=0).cumsum(axis=1)
c = np.pad(
A,
pad_width=tuple([(1, 0) for x in list(self.extent)] + [(0, 0)]), mode='constant', constant_values=0
)
w0 = self.window[0]
w1 = self.window[1]
return (
c[slice(w0, toroidal_F.shape[0]+1), slice(w1, toroidal_F.shape[1]+1), :] -
c[slice(0, self.extent[0]), slice(w1, toroidal_F.shape[1] + 1), :] -
c[slice(w0, toroidal_F.shape[0]+1), slice(0, self.extent[1]), :] +
c[slice(0, self.extent[0]), slice(0, self.extent[1]), :]
)[slice(0, toroidal_F.shape[0]), slice(0, toroidal_F.shape[1]), :]
# Adding noise to q, which was not originally in the matlab code
self.q = self.q + 0.25*noise
lql = np.log(np.reshape(np.moveaxis(self.q, 0, -1), (P, T)))
self.q = normalize_q(lql)
        qlsm = np.fft.ifft2(np.fft.fft2(
np.reshape(self.q, (T, P))
)).real.astype(np.float64)
lql = np.log(np.reshape(np.moveaxis(self.q, 0, -1), (P, T)))
# Check the distribution properties of ql
'''
assert( np.all(np.isclose(np.sum(self.q,axis=0)[:,:],1 ) )==True)
assert(np.any((self.q) <0 ) == False)
'''
alpha = 1e-10
miter = 1
SCALING_FACTOR = 2.5
        # ~1/2 the average number of words per document; make this number smaller as the counting grid gets bigger
pseudocounts = np.mean(np.sum(data, axis=1)) / (P*SCALING_FACTOR)
# This is the posterior for each layer Q( Layer | document ).
qla = np.ones([L, T])
lqla = np.log(qla)
plal = np.ones([L, P]) / L # P(layer | position in counting grid)
dirichlet_prior = np.ones([L, Z])
# Didn't implement the fft to smooth the posterior probabilities
# of picking a location, differing from previous code by choice.
every_iter = 1
nmax = 2 # maximum number of iterations
start_ql = 1
minp = 1e-10
TOROID_ARGUMENTS = [(self.window[0], 0), (self.window[1], 0), (0, 0)]
eps = (np.finfo(h_la.dtype).eps)
for iter in range(nmax):
# assert( np.all(np.isclose(np.sum(np.reshape(np.transpose(self.q),(P,T)),axis=0),1 ) ))
# assert(np.any((self.q) <0 ) == False)
# assert(np.all(np.isclose(np.sum(qla,axis=0),1 )))
# assert(np.any((qla) <0 ) == False)
if iter >= start_ql:
lql = np.zeros([P, T])
for l in range(L):
tmp = np.reshape(np.log(eps + h_la[:, :, :, l]), [P, Z])
# qla is the Q(layer) in the mean field posterior factorization, its structure is L,T
lql = lql + np.dot(tmp, np.transpose(data))*qla[l, :]
self.q = np.reshape(
normalize_q(lql),
[T, self.extent[0], self.extent[1]]
)
# Didn't use the smoothened ql
qlsm = np.fft.ifft2(np.fft.fft2(
np.reshape(self.q, (T, P))
)).real.astype(np.float64)
# update Q(layer)
tmpq = np.reshape(np.moveaxis(np.copy(self.q), 0, -1), [P, T])
for l in range(L):
tmp = np.reshape(np.log(alpha + h_la[:, :, :, l]), [P, Z])
lqla[l, :] = np.sum(
tmpq*(np.dot(tmp, np.transpose(data)) +
np.reshape(np.transpose(np.log(plal[l, :])), [P, 1])),
axis=0
)
lqlamax = np.amax(lqla, axis=0)
qla = np.exp(
(lqla - lqlamax) - np.log(np.sum(np.exp((lqla - lqlamax)), axis=0))
)
# M-STEP. Basically the normal CG M-Step, repeated #Layers times.
# Reimplemented it to include the dirichlet prior in the mix
for l in range(L):
# Dirichlet prior. Does very little in practice, may be useful only in the early stages of learning... leave it.
tmpdirip = dirichlet_prior[l, :] - 1
tmpdirip[np.isnan(tmpdirip)] = 0
# Recall that self.q is T x E1 x E2 tensor
# Original Matlab code here for reference:
#
# nrm = bsxfun(@plus, reshape(tmpdirip,[1 1 Z]),
# reshape( reshape( padarray( ql, W, 'circular','pre'),
# [prod(cg_size+W),T])*bsxfun( @times, WD, qla(l,:))', [ cg_size+W,Z ]));
first = np.reshape(np.pad(np.moveaxis(
self.q, 0, -1), TOROID_ARGUMENTS, 'wrap'), [np.prod(self.extent+self.window), T])
            # using the transpose function makes sense here because we're just swapping 2 dimensions
D = np.dot(first, np.transpose(np.transpose(data) * qla[l, :]))
nrm = np.reshape(tmpdirip, [1, 1, Z]) + np.reshape(
D, [self.extent[0]+self.window[0], self.extent[1]+self.window[1], Z])
QH = nrm / \
np.pad(h_la[:, :, :, l] + np.prod(self.window) * alpha, TOROID_ARGUMENTS, 'wrap')
QH = QH[slice(1, self.extent[0]+self.window[0]),
slice(1, self.extent[1]+self.window[1]), :]
QH = update_summand(self, QH, T, Z)
QH[QH < 0] = 0
un_pi = pseudocounts + QH*(pi_la[:, :, :, l]+alpha)
mask = np.sum(un_pi, 2) != 0
nmask = np.sum(un_pi, 2) == 0
M1 = np.transpose(np.transpose(
un_pi)*np.transpose(mask.astype(np.float64)) / np.transpose(np.sum(un_pi, 2)))
M2 = np.ones([Z, self.extent[0], self.extent[1]]) * \
(nmask).astype(np.float64)
pi_la[:, :, :, l] = M1 + np.moveaxis((1.0/Z) * M2, 0, 2)
layer_compute_h(pi_la, h_la)
qlsm = np.fft.ifft2(np.fft.fft2(np.reshape(
self.q, (T, P)))).real.astype(np.float64)
A = np.sum(qlsm, axis=0)
if np.any(np.isclose(A, 0)):
A += eps
plal = np.transpose(np.reshape(np.sum(np.reshape(np.moveaxis(
qlsm, 0, -1), [self.extent[0], self.extent[1], T, 1]) * np.reshape(np.transpose(qla), [1, 1, T, L]), axis=2), [P, L]))/A
plal[plal < 1e-100] = 1e-100
plal = plal / np.sum(plal, axis=0) # sum over the layers
INVERSE_DOCUMENT_FREQUENCY = np.log(
data.shape[0] + eps) - np.log(np.sum((data > 0).astype(np.float64), axis=0) + eps)
pi_la_idf = np.zeros(pi_la.shape)
for l in range(L):
pi_la_idf[:, :, :, l] = pi_la[:, :, :, l] * \
INVERSE_DOCUMENT_FREQUENCY
id_layers = np.argmax(qla, axis=0) + 1
wg_ep = eps
mask = np.pad(np.ones(self.window), [
(0, x) for x in self.extent-self.window], 'constant', constant_values=0)
wg = np.zeros([self.extent[0], self.extent[1], L])
for l in range(L):
idl = np.where(id_layers-1 == l)[0] # id_layer in matlab format
for t in range(len(idl)):
wg[:, :, l] = wg[:, :, l] + np.fft.ifft2(np.fft.fft2(mask)*np.fft.fft2(
self.q[idl[t], :, :])).real.astype(np.float64) # m x E structure
# This makes wg not a distribution.
wg = np.transpose(np.transpose(
wg) / (np.sum(np.transpose(wg), axis=0) + wg_ep))
# sum over the layers
pi2_idf = np.sum(
pi_la_idf*np.reshape(wg, [self.extent[0], self.extent[1], 1, L]), axis=3)
# Renormalize Pi after using inverse document frequency.
pi2_idf = np.transpose(np.transpose(pi2_idf) /
np.transpose(np.sum(pi2_idf, axis=2)))
return {
"pi2_idf": pi2_idf, "pi_la_idf": pi_la_idf, "id_layer": [id_layers],
"ql2": self.q, "counts_to_show": np.transpose(data), "indices_to_show": [np.array(list(range(T))) + 1],
"pi": self.pi, "pi_la": pi_la
}
def fit(
self, data, max_iter=100, returnSumSquareDifferencesOfPi=False,
noise=.000001, learn_pi=True, pi=None, layers=1, output_directory="./", heartBeaters=None
):
"""
Implements variational expectation maximization for the Counting Grid model
Assumes: data is an m x n matrix
        TODO: return early if the fit converges; don't just run for max_iter.
"""
if not os.path.exists(str(output_directory)):
raise Exception(
"output_directory does not exist for counting grids trainer."
)
def SSD(pi, piHat):
A = np.abs(pi - piHat)
return np.sum(A * A)
alpha = 1e-10
SSDPi = []
data = data.astype(np.float64)
if pi is None:
self.initializePi(data)
else:
self.pi = pi
self.h = self.compute_h(self.pi, self.window)
self.check_model()
extentProduct = np.prod(self.extent)
T, _ = data.shape
pseudocounts = np.mean(np.sum(data, axis=1) / extentProduct) / 2.5
# q is an m x dim(extent) structure
qshape = [len(data)]
for v in self.extent:
qshape.append(v)
self.q = np.zeros(tuple(qshape))
i = 0
while i < max_iter:
# E-Step
self.q = self.q_update(data)
# M-Step
if learn_pi:
if returnSumSquareDifferencesOfPi:
pi = self.pi
self.pi = self.pi_update(data, pseudocounts, alpha)
if returnSumSquareDifferencesOfPi:
piHat = self.pi
SSDPi.append(SSD(pi, piHat))
self.h = self.compute_h(self.pi, self.window)
i = i + 1
[(h.makeProgress(int(100*i/max_iter)) if h is not None else False)
for h in heartBeaters] if heartBeaters is not None else False
if layers > 1:
self.layercgdata = self.cg_layers(data, L=layers, noise=noise)
scipy.io.savemat(str(output_directory) +
"/CountingGridDataMatrices.mat", self.layercgdata)
else:
scipy.io.savemat(str(output_directory) +
"/CGData.mat", {"pi": self.pi, "q": self.q})
return self.pi
# assumptions that we need for the model to be valid
def check_model(self):
assert(len(self.extent) == len(self.window))
for v in self.extent:
assert(int(v) == v)
assert(v > 0)
for v in self.window:
assert(int(v) == v)
assert(v > 0)
# dimensions of pi is one more than the window
assert(self.pi.ndim - 1 == len(self.window))
# w0 and w1 are window size
def compute_h_noLoopFull(self, PI, w0, w1):
return PI[w0:, w1:, :] - PI[:-w0, w1:, :] - PI[w0:, :-w1, :] + PI[:-w0, :-w1, :]
# no side effects
def compute_h(self, pi, W):
PI = np.pad(pi, [(0, W[0]), (0, W[1]), (0, 0)],
'wrap').cumsum(axis=0).cumsum(axis=1)
PI = np.pad(PI, [(1, 0), (1, 0), (0, 0)], 'constant')
w0 = W[0]
w1 = W[1]
cumsum_output = self.compute_h_noLoopFull(PI, w0, w1)
return np.moveaxis(np.moveaxis(cumsum_output[:-1, :-1, :], 2, 0)/np.sum(cumsum_output[:-1, :-1, :], axis=2), 0, -1)
# How to initialize pi
# Note that we don't want pi to be 0, since our update equations depend on a multiplication by pi
def initializePi(self, data, technique="uniform"):
if technique is "uniform":
size = [x for x in self.extent]
size.append(data.shape[1])
self.pi = np.random.random(size=tuple(size)).astype(np.float64)
else:
raise ValueError("No initialize strategy given")
def pi_update(self, data, pseudocounts, alpha):
'''
Modifies: pi
Assumes: self.q has been initialized (in the fit function) and that h has been initialized
Assumes: a two dimensional extent
Recall: q is M x E tensor
'''
T, Z = data.shape
W = self.window
# QdotConH is called nrm in matlab engine, but padding is done beforehand in matlab
QdotConH = np.dot(np.moveaxis(self.q, 0, -1,), data)
QH = np.pad(QdotConH / (self.h + np.prod(self.window)*alpha),
[(W[0], 0), (W[1], 0), (0, 0)], 'wrap').cumsum(axis=0).cumsum(axis=1)
w0 = W[0]
w1 = W[1]
QH = self.compute_h_noLoopFull(QH, w0, w1)
QH[QH < 0] = 0
un_pi = pseudocounts + QH*(self.pi+alpha)
mask = (np.sum(un_pi, axis=2) != 0).astype(np.float64)
not_mask = (np.sum(un_pi, axis=2) == 0).astype(np.float64)
denom = np.sum(un_pi, axis=2)
self.pi = np.transpose(np.transpose(mask)*(np.transpose(un_pi) / np.transpose(denom))) + \
(1.0/Z) * np.transpose(np.transpose(
np.ones([self.extent[0], self.extent[1], Z]))*np.transpose(not_mask))
return self.pi
def get_indices_for_window_indexed_by_k(self, k, z):
indices = [[]]*len(self.extent)
for j, v in enumerate(indices):
indices[j] = slice(k[j], k[j] + self.window[j])
indices.append(z) # the z index in the paper
return tuple(indices)
def get_h(self):
return self.h
def q_update(self, data):
L = np.prod(self.extent)
lql = np.dot(np.log(self.h).reshape(
(L, data.shape[1])), np.transpose(data))
lqlmax = np.amax(lql, axis=0)
min_prob = 1.0/(10*L)
Lq = ((lql-lqlmax)-np.log(np.sum(np.exp(lql-lqlmax), axis=0))
).reshape(tuple(list(self.extent) + [data.shape[0]]))
q = np.exp(Lq)
q[q < min_prob] = min_prob
q = q / np.sum(np.sum(q, axis=0), axis=0)
return np.moveaxis(q, 2, 0)
# Assumes: bagofwordcounts are integers
def predict_probabilities(self, bagofwordcounts, k):
assert(len(k) == len(self.window))
log_p = 0 # log of the products is the sum of the logs
for (i, count) in enumerate(bagofwordcounts):
if count > 0:
indices = self.get_indices_for_window_indexed_by_k(k, i)
log_p += np.log(np.sum(self.pi[indices].reshape(
self.pi[indices].size)))*float(count)
# 1/normalizationfactor * probability
# e ^ (log(prob) - log(normalization factor))
return math.pow(
math.e,
log_p - np.sum(bagofwordcounts)*np.sum(np.log(self.window))
)
if __name__ == "__main__":
data = np.array([
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
[1, 2, 1, 0, 1, 0, 0, 0, 0, 0],
[0, 1, 1, 0, 0, 1, 1, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 1, 1, 1]
])
extent = np.array([3, 3])
window = np.array([2, 2])
model = CountingGridModel(extent, window)
model.fit(data, max_iter=1000,
returnSumSquareDifferencesOfPi=False, layers=2)
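    # --- Illustrative sketch (an assumption, not part of the original file): after
    # fitting, predict_probabilities scores a bag of word counts against the window
    # anchored at grid position k; scanning every position finds the window under
    # which the first toy document is most likely.
    best_k, best_p = None, -1.0
    for r in range(extent[0]):
        for c in range(extent[1]):
            p = model.predict_probabilities(data[0], [r, c])
            if p > best_p:
                best_k, best_p = [r, c], p
    print("Most likely window position for document 0:", best_k)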


@ -0,0 +1,3 @@
from .CountingGridModel import CountingGridModel
__all__ = ['CountingGridModel']


@ -0,0 +1,29 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import unittest
import numpy as np
from CountingGridsPy.models import CountingGridModel
class TestCorrectnessOfNontrivialDesignMatrix(unittest.TestCase):
def setUp(self):
M, N = [5, 7]
self.N = N
self.data = np.array(
list(range(1, 8))+list(range(7, 0, -1)) +
[1]*7+[0, 1, 0, 1, 0, 1, 0]+list(range(1, 8))
).reshape((M, N))
# note: after one iteration h distribution is the same regardless of position on matrix or window size
self.extent = np.array([5, 5])
window = np.array([2, 3])
self.pi_init = np.ones([5]*2+[N])/1000
self.model = CountingGridModel(self.extent, window)
def test_correct_data(self):
assert(sum(sum(self.data)) == 94)
def test_fitted_model(self):
self.model.fit(self.data, max_iter=1,
returnSumSquareDifferencesOfPi=False, pi=np.copy(self.pi_init))
assert(np.all(np.isclose(self.model.q, .04)))
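# --- Illustrative addition (an assumption, not part of the original file): wire the test
# case into unittest's own runner so this file can also be executed directly.
if __name__ == "__main__":
    unittest.main()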

20
pipeline.yaml Normal file

@ -0,0 +1,20 @@
- job:
pool:
vmImage: 'ubuntu-16.04'
strategy:
matrix:
Python36:
python.version: '3.6'
steps:
- task: UsePythonVersion@0
displayName: 'Use Python $(python.version)'
inputs:
versionSpec: '$(python.version)'
- script: pip install pep8
displayName: 'Install pep8 linter'
- script: pep8 . --max-line-length=150
displayName: 'Run pep8 linter'